#Research Question:

What individual, behavioral, and health factors are associated with physical health burden among U.S. adults? How do findings compare when the outcome is modeled as the number of 2 physically unhealthy days (continuous) versus as an indicator of frequent physical distress (binary: 14 or more days in the past 30)?

#Part 0: Data Preparation Step 1: Download Packages

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)
library(janitor)
library(leaps)

Step 1: Import Data

brfss_2023 <- read_xpt(
  "~/Documents/HEPI553/data/LLCP2023.XPT"

  ) |>
  clean_names()

433,323 observations and 350 variables

Step 2: Recode All Variables

brfss_clean <- brfss_2023 |>
  mutate(
    
    # Mental health days
    menthlth_days = case_when(
      menthlth == 88 ~ 0,
      menthlth %in% c(77, 99) ~ NA_real_,
      menthlth >= 1 & menthlth <= 30  ~ as.numeric(menthlth), 
      TRUE ~ NA_real_
    ),
    
    # Physical unhealth days
    physhlth_days = case_when(
      physhlth == 88 ~ 0,
      physhlth %in% c(77, 99) ~ NA_real_,
      physhlth >= 1 & physhlth <= 30  ~ as.numeric(physhlth),
      TRUE ~ NA_real_
    ),
    
    # BMI
    bmi = case_when(
      bmi5 == 9999 ~ NA_real_,
      TRUE ~ as.numeric(bmi5) / 100
    ),
    
    # Sex
    sex = factor(case_when(
      sexvar == 1 ~ "Male",
      sexvar == 2 ~ "Female",
      TRUE ~ NA_character_
    ),
    levels = c("Male", "Female")),
    
    # Exercise
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      exerany2 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    levels = c("No","Yes")),
    
    #Depression
    depression = factor(case_when(
      addepev3 == 1 ~ "Yes",
      addepev3 == 2 ~ "No",
      addepev3 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    
    levels = c("No","Yes")),
  
    # Age group
    age_group = factor(case_when(
      ageg5yr %in% c(1,2,3) ~ "18-34",
      ageg5yr %in% c(4,5,6) ~ "35-49",
      ageg5yr %in% c(7,8,9) ~ "50-64",
      ageg5yr %in% c(10,11,12,13) ~ "65+",
      TRUE ~ NA_character_
    ),
      
      levels = c("18-34","35-49","50-64","65+")),
     
    # Income
    income = factor(case_when(
    incomg1 %in% c(1,2) ~ "Less than $25k",
    incomg1 %in% c(3,4) ~ "$25-$49",
    incomg1 %in% c(5,6) ~ "$50-$99",
    incomg1 == 7 ~ "$100k+",
    incomg1 == 9 ~ NA_character_,
     TRUE ~ NA_character_
    ),
    
    levels = c("Less than $25k", "$25-$49", "$50-$99", "$100k+")),
        
    # Education
    education = factor(case_when(
      educa %in% c(1, 2,3) ~ "Less than HS",
      educa == 4 ~ "HS/GED",
      educa == 5 ~ "Some college",
      educa == 6 ~ "College graduate",
      educa == 9 ~ NA_character_, 
      TRUE ~ NA_character_
    ), 
     levels = c("Less than HS","HS/GED","Some college","College graduate")),
    
    # Smoking
    smoking = factor(case_when(
      smoker3 %in% c(1, 2) ~ "Current smoker",
      smoker3 == 3 ~ "Former smoker",
      smoker3 == 4 ~ "Never smoker",
      smoker3 == 9 ~ NA_character_
    ), 
    levels = c("Current smoker","Former smoker","Never smoker")),
    
    # General Health Status 
    gen_health = factor (case_when(
      genhlth %in% c(1,2) ~ "Excellent/Very Good",
      genhlth == 3 ~ "Good",
      genhlth %in% c(4,5) ~ "Fair/Poor",
      genhlth %in% c(7, 9) ~ NA_character_,
       TRUE ~ NA_character_
    ),
    
    levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
    
    #Marital Status 
    marital = factor(case_when(
      marital %in% c(1,6) ~ "Married/Partnered",
      marital %in% c(2, 3, 4) ~ "Previously married",
      marital == 5 ~ "Never married",
      marital == 9 ~ NA_character_,
       TRUE ~ NA_character_
    ),
    
    levels = c("Married/Partnered", "Previously married", "Never married")),
    
  ) |> 
  
select(menthlth_days, physhlth_days, bmi, sex,depression, exercise, age_group, income, education, smoking, gen_health, marital)

Step 3: Analytic Dataset and Descriptive Summary

Report Missing Observations

cat("Missing values in analytic dataset:", sum(!complete.cases(brfss_clean)), "\n")
## Missing values in analytic dataset: 125095
cat("All models are fit on the same n =", nrow(brfss_clean), "observations.\n")
## All models are fit on the same n = 433323 observations.
cat("Missing menthlth_days:", sum(is.na(brfss_clean$menthlth_days)), 
    "(", round(mean(is.na(brfss_clean$menthlth_days)) * 100, 1), "%)\n")
## Missing menthlth_days: 8108 ( 1.9 %)
cat("Missing physhlth_days:", sum(is.na(brfss_clean$physhlth_days)), 
    "(", round(mean(is.na(brfss_clean$physhlth_days)) * 100, 1), "%)\n")
## Missing physhlth_days: 10785 ( 2.5 %)
cat("Missing income:", sum(is.na(brfss_clean$income)), 
    "(", round(mean(is.na(brfss_clean$income)) * 100, 1), "%)\n")
## Missing income: 86623 ( 20 %)

Drop Missing

set.seed(1220)
brfss_analytic_new <- brfss_clean |>
 drop_na() |>
 slice_sample(n = 8000)

Reporting Analytic Sample

nrow(brfss_analytic_new)
## [1] 8000

Descriptive Table

brfss_analytic_new |>
  select(menthlth_days, physhlth_days, age_group, sex, education, depression, exercise, smoking, bmi, income, gen_health, marital) |>
  tbl_summary(
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      age_group     ~ "Age (years)",
      education     ~ "Education level",
      depression    ~ "Ever told depressive disorder",
      exercise      ~ "Any physical activity (past 30 days)",
      smoking       ~ "Smoker Status",
      bmi           ~ "BMI Status",
      income        ~ "Income Status",
      gen_health    ~ "General health status",
      marital ~ "Marital status",
      sex     ~ "Sex"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023**") |>
  as_gt()
Table 1. Descriptive Statistics — BRFSS 2023
Characteristic N N = 8,0001
Mentally unhealthy days (past 30) 8,000 4.3 (8.2)
Physically unhealthy days (past 30) 8,000 4.3 (8.7)
Age (years) 8,000
    18-34
1,294 (16%)
    35-49
1,657 (21%)
    50-64
2,109 (26%)
    65+
2,940 (37%)
Sex 8,000
    Male
3,971 (50%)
    Female
4,029 (50%)
Education level 8,000
    Less than HS
391 (4.9%)
    HS/GED
1,887 (24%)
    Some college
2,115 (26%)
    College graduate
3,607 (45%)
Ever told depressive disorder 8,000 1,776 (22%)
Any physical activity (past 30 days) 8,000 6,157 (77%)
Smoker Status 8,000
    Current smoker
962 (12%)
    Former smoker
2,230 (28%)
    Never smoker
4,808 (60%)
BMI Status 8,000 28.7 (6.5)
Income Status 8,000
    Less than $25k
1,090 (14%)
    $25-$49
1,931 (24%)
    $50-$99
4,297 (54%)
    $100k+
682 (8.5%)
General health status 8,000
    Excellent/Very Good
3,926 (49%)
    Good
2,648 (33%)
    Fair/Poor
1,426 (18%)
Marital status 8,000
    Married/Partnered
4,582 (57%)
    Previously married
2,050 (26%)
    Never married
1,368 (17%)
1 Mean (SD); n (%)

Part 1: Model Selection for Linear Regression

Step 1: Fit the Maximum Model

Linear Regression Model

model_maxA <- lm(physhlth_days ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital, 
             data = brfss_analytic_new)

summary(model_maxA)
## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + age_group + sex + 
##     education + depression + exercise + smoking + bmi + income + 
##     gen_health + marital, data = brfss_analytic_new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.66546    0.64506   4.132 3.63e-05 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## smokingFormer smoker      -0.02637    0.28445  -0.093 0.926150    
## smokingNever smoker       -0.46414    0.26617  -1.744 0.081241 .  
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## income$25-$49             -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50-$99             -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100k+              -1.31580    0.39821  -3.304 0.000956 ***
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16
glance(model_maxA) |>
  dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
  mutate(across(everything(), \(x) round(x, 3))) |>
  kable(caption = "Maximum Model: Fit Statistics") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Maximum Model: Fit Statistics
r.squared adj.r.squared sigma AIC BIC df.residual
0.326 0.324 7.112 54115.05 54268.77 7979

Step 2: Best Subsets Regression

best_subsets <- regsubsets(
  physhlth_days ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
  data = brfss_analytic_new,
  nvmax = 12,      # maximum number of variables to consider
  method = "exhaustive"
)

best_summary <- summary(best_subsets)

subset_metrics <- tibble(
  p = 1:length(best_summary$adjr2),
  `Adj. R²` = best_summary$adjr2,
  BIC = best_summary$bic,
  Cp = best_summary$cp
)

ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.max(best_summary$adjr2),
             linetype = "dashed", color = "tomato") +
  labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
  theme_minimal(base_size = 12)

ggplot(subset_metrics, aes(x = p, y = BIC)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.min(best_summary$bic),
             linetype = "dashed", color = "tomato") +
  labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
  theme_minimal(base_size = 12)

In 2-3 sentences, describe what you observe: Do the two criteria agree on the best model size? If not, which criterion favors a simpler model?

The best subset confirms the analysis by using both AIC and BIC models. The adjusted R^2 reaches it maximum at 12 variables and plateaus, while BIC selects a more parsimonious model with 8.6 variables. As shown, the two criteria agree on the best model size since each show the correlation between the outcome and the variables used.

Step 3: Stepwise Selection Methods

Backward elimination

mod_backward <- step(model_maxA, direction = "backward", trace = 1)
## Start:  AIC=31410.04
## physhlth_days ~ menthlth_days + age_group + sex + education + 
##     depression + exercise + smoking + bmi + income + gen_health + 
##     marital
## 
##                 Df Sum of Sq    RSS   AIC
## - marital        2       180 403789 31410
## <none>                       403609 31410
## - sex            1       104 403712 31410
## - bmi            1       108 403716 31410
## - smoking        2       347 403956 31413
## - depression     1       656 404265 31421
## - education      3       876 404485 31421
## - income         3      1550 405158 31435
## - age_group      3      2235 405844 31448
## - exercise       1      4537 408146 31497
## - menthlth_days  1     13230 416839 31666
## - gen_health     2     84670 488279 32930
## 
## Step:  AIC=31409.61
## physhlth_days ~ menthlth_days + age_group + sex + education + 
##     depression + exercise + smoking + bmi + income + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - bmi            1        98 403887 31410
## <none>                       403789 31410
## - sex            1       102 403891 31410
## - smoking        2       345 404134 31412
## - depression     1       637 404427 31420
## - education      3       875 404665 31421
## - income         3      1388 405177 31431
## - age_group      3      3048 406837 31464
## - exercise       1      4538 408327 31497
## - menthlth_days  1     13195 416984 31665
## - gen_health     2     84707 488496 32929
## 
## Step:  AIC=31409.55
## physhlth_days ~ menthlth_days + age_group + sex + education + 
##     depression + exercise + smoking + income + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944
tidy(mod_backward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Backward Elimination Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Backward Elimination Result (AIC-based)
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 1.8629 0.5225 3.5653 0.0004 0.8387 2.8871
menthlth_days 0.1836 0.0114 16.1748 0.0000 0.1614 0.2059
age_group35-49 0.8374 0.2703 3.0980 0.0020 0.3075 1.3672
age_group50-64 1.4778 0.2583 5.7206 0.0000 0.9714 1.9841
age_group65+ 1.8366 0.2513 7.3099 0.0000 1.3441 2.3292
sexFemale 0.2329 0.1633 1.4262 0.1539 -0.0872 0.5530
educationHS/GED 0.2576 0.4006 0.6431 0.5202 -0.5276 1.0428
educationSome college 0.9636 0.4025 2.3943 0.0167 0.1747 1.7525
educationCollege graduate 1.0310 0.4043 2.5502 0.0108 0.2385 1.8236
depressionYes 0.7471 0.2149 3.4769 0.0005 0.3259 1.1683
exerciseYes -1.9048 0.2031 -9.3793 0.0000 -2.3029 -1.5067
smokingFormer smoker -0.0377 0.2831 -0.1332 0.8940 -0.5926 0.5172
smokingNever smoker -0.4776 0.2647 -1.8038 0.0713 -0.9965 0.0414
income$25-$49 -1.0324 0.2749 -3.7557 0.0002 -1.5713 -0.4936
income$50-$99 -1.3884 0.2658 -5.2239 0.0000 -1.9094 -0.8674
income$100k+ -1.1122 0.3849 -2.8897 0.0039 -1.8666 -0.3577
gen_healthGood 1.3636 0.1839 7.4161 0.0000 1.0032 1.7241
gen_healthFair/Poor 10.0009 0.2486 40.2245 0.0000 9.5136 10.4883

Forward selection

mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic_new)

mod_forward <- step(mod_null,
                    scope = list(lower = mod_null, upper = model_maxA),
                    direction = "forward", trace = 1)
## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1   17940.1 416725 31632
## + exercise       1    6466.7 428198 31849
## + depression     1    6070.1 428595 31857
## + income         3    3231.0 431434 31913
## + smoking        2    1776.3 432889 31938
## + age_group      3    1535.7 433129 31945
## + sex            1    1254.2 433411 31946
## + marital        2     830.3 433835 31956
## + education      3     415.5 434250 31965
## <none>                       434665 31967
## + bmi            1       0.2 434665 31969
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##              Df Sum of Sq    RSS   AIC
## + exercise    1    5820.8 410904 31521
## + age_group   3    4661.5 412064 31548
## + income      3    1987.0 414738 31600
## + marital     2    1402.5 415322 31609
## + smoking     2    1034.6 415690 31616
## + depression  1     738.1 415987 31620
## + sex         1     605.2 416120 31622
## + education   3     318.9 416406 31632
## <none>                    416725 31632
## + bmi         1       3.6 416721 31634
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##              Df Sum of Sq    RSS   AIC
## + age_group   3    3720.5 407184 31455
## + income      3    1198.4 409706 31504
## + marital     2    1064.8 409839 31505
## + depression  1     739.1 410165 31509
## + smoking     2     706.2 410198 31512
## + education   3     686.9 410217 31514
## + sex         1     449.3 410455 31515
## <none>                    410904 31521
## + bmi         1      82.9 410821 31522
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##              Df Sum of Sq    RSS   AIC
## + income      3   1162.69 406021 31438
## + depression  1    932.70 406251 31438
## + education   3    501.69 406682 31451
## + sex         1    261.03 406923 31452
## + smoking     2    360.51 406823 31452
## <none>                    407184 31455
## + bmi         1     83.59 407100 31455
## + marital     2     40.11 407144 31458
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##              Df Sum of Sq    RSS   AIC
## + depression  1    861.57 405159 31423
## + education   3    953.62 405067 31425
## + smoking     2    305.02 405716 31436
## + sex         1    203.01 405818 31436
## <none>                    406021 31438
## + bmi         1     74.91 405946 31438
## + marital     2    153.70 405867 31439
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##             Df Sum of Sq    RSS   AIC
## + education  3    837.21 404322 31412
## + smoking    2    259.11 404900 31422
## + sex        1    110.16 405049 31422
## + bmi        1    105.12 405054 31423
## <none>                   405159 31423
## + marital    2    169.72 404990 31423
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##           Df Sum of Sq    RSS   AIC
## + smoking  2    332.18 403990 31410
## + bmi      1    110.19 404212 31412
## <none>                 404322 31412
## + sex      1     77.87 404244 31413
## + marital  2    168.26 404154 31413
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##           Df Sum of Sq    RSS   AIC
## + sex      1    102.92 403887 31410
## <none>                 403990 31410
## + bmi      1     98.80 403891 31410
## + marital  2    169.54 403820 31410
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##           Df Sum of Sq    RSS   AIC
## <none>                 403887 31410
## + bmi      1    97.861 403789 31410
## + marital  2   170.601 403716 31410
tidy(mod_forward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Forward Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Forward Selection Result (AIC-based)
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 1.8629 0.5225 3.5653 0.0004 0.8387 2.8871
gen_healthGood 1.3636 0.1839 7.4161 0.0000 1.0032 1.7241
gen_healthFair/Poor 10.0009 0.2486 40.2245 0.0000 9.5136 10.4883
menthlth_days 0.1836 0.0114 16.1748 0.0000 0.1614 0.2059
exerciseYes -1.9048 0.2031 -9.3793 0.0000 -2.3029 -1.5067
age_group35-49 0.8374 0.2703 3.0980 0.0020 0.3075 1.3672
age_group50-64 1.4778 0.2583 5.7206 0.0000 0.9714 1.9841
age_group65+ 1.8366 0.2513 7.3099 0.0000 1.3441 2.3292
income$25-$49 -1.0324 0.2749 -3.7557 0.0002 -1.5713 -0.4936
income$50-$99 -1.3884 0.2658 -5.2239 0.0000 -1.9094 -0.8674
income$100k+ -1.1122 0.3849 -2.8897 0.0039 -1.8666 -0.3577
depressionYes 0.7471 0.2149 3.4769 0.0005 0.3259 1.1683
educationHS/GED 0.2576 0.4006 0.6431 0.5202 -0.5276 1.0428
educationSome college 0.9636 0.4025 2.3943 0.0167 0.1747 1.7525
educationCollege graduate 1.0310 0.4043 2.5502 0.0108 0.2385 1.8236
smokingFormer smoker -0.0377 0.2831 -0.1332 0.8940 -0.5926 0.5172
smokingNever smoker -0.4776 0.2647 -1.8038 0.0713 -0.9965 0.0414
sexFemale 0.2329 0.1633 1.4262 0.1539 -0.0872 0.5530

Stepwise selection

mod_stepwise <- step(mod_null,
                     scope = list(lower = mod_null, upper = model_maxA),
                     direction = "both", trace = 1)
## Start:  AIC=34524.38
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    164022 434665 31967
## + menthlth_days  1     60303 538384 33677
## + exercise       1     34542 564145 34051
## + income         3     28842 569845 34135
## + depression     1     20750 577937 34244
## + smoking        2     12790 585897 34356
## + education      3      8593 590094 34415
## + marital        2      7321 591366 34430
## + bmi            1      6253 592434 34442
## + age_group      3      6527 592160 34443
## + sex            1      1649 597038 34504
## <none>                       598687 34524
## 
## Step:  AIC=31967.07
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     17940 416725 31632
## + exercise       1      6467 428198 31849
## + depression     1      6070 428595 31857
## + income         3      3231 431434 31913
## + smoking        2      1776 432889 31938
## + age_group      3      1536 433129 31945
## + sex            1      1254 433411 31946
## + marital        2       830 433835 31956
## + education      3       416 434250 31965
## <none>                       434665 31967
## + bmi            1         0 434665 31969
## - gen_health     2    164022 598687 34524
## 
## Step:  AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1      5821 410904 31521
## + age_group      3      4661 412064 31548
## + income         3      1987 414738 31600
## + marital        2      1402 415322 31609
## + smoking        2      1035 415690 31616
## + depression     1       738 415987 31620
## + sex            1       605 416120 31622
## + education      3       319 416406 31632
## <none>                       416725 31632
## + bmi            1         4 416721 31634
## - menthlth_days  1     17940 434665 31967
## - gen_health     2    121660 538384 33677
## 
## Step:  AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      3720 407184 31455
## + income         3      1198 409706 31504
## + marital        2      1065 409839 31505
## + depression     1       739 410165 31509
## + smoking        2       706 410198 31512
## + education      3       687 410217 31514
## + sex            1       449 410455 31515
## <none>                       410904 31521
## + bmi            1        83 410821 31522
## - exercise       1      5821 416725 31632
## - menthlth_days  1     17294 428198 31849
## - gen_health     2    101805 512709 33288
## 
## Step:  AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3      1163 406021 31438
## + depression     1       933 406251 31438
## + education      3       502 406682 31451
## + sex            1       261 406923 31451
## + smoking        2       361 406823 31451
## <none>                       407184 31455
## + bmi            1        84 407100 31455
## + marital        2        40 407144 31458
## - age_group      3      3720 410904 31521
## - exercise       1      4880 412064 31548
## - menthlth_days  1     19957 427141 31835
## - gen_health     2     94875 502059 33126
## 
## Step:  AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##                 Df Sum of Sq    RSS   AIC
## + depression     1       862 405159 31423
## + education      3       954 405067 31425
## + smoking        2       305 405716 31436
## + sex            1       203 405818 31436
## <none>                       406021 31438
## + bmi            1        75 405946 31438
## + marital        2       154 405867 31439
## - income         3      1163 407184 31455
## - age_group      3      3685 409706 31504
## - exercise       1      4214 410235 31518
## - menthlth_days  1     18945 424966 31801
## - gen_health     2     87509 493530 32995
## 
## Step:  AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       837 404322 31412
## + smoking        2       259 404900 31422
## + sex            1       110 405049 31423
## + bmi            1       105 405054 31423
## <none>                       405159 31423
## + marital        2       170 404990 31423
## - depression     1       862 406021 31438
## - income         3      1092 406251 31438
## - age_group      3      3871 409030 31493
## - exercise       1      4219 409378 31504
## - menthlth_days  1     13674 418834 31686
## - gen_health     2     86610 491770 32969
## 
## Step:  AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2       332 403990 31410
## + bmi            1       110 404212 31412
## <none>                       404322 31412
## + sex            1        78 404244 31413
## + marital        2       168 404154 31413
## - education      3       837 405159 31423
## - depression     1       745 405067 31425
## - income         3      1495 405818 31436
## - age_group      3      3558 407880 31476
## - exercise       1      4604 408926 31501
## - menthlth_days  1     13607 417929 31675
## - gen_health     2     86813 491135 32964
## 
## Step:  AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + sex            1       103 403887 31410
## <none>                       403990 31410
## + bmi            1        99 403891 31410
## + marital        2       170 403820 31410
## - smoking        2       332 404322 31412
## - depression     1       691 404681 31421
## - education      3       910 404900 31422
## - income         3      1457 405447 31432
## - age_group      3      3178 407168 31466
## - exercise       1      4502 408492 31496
## - menthlth_days  1     13347 417337 31668
## - gen_health     2     85552 489542 32942
## 
## Step:  AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       403887 31410
## - sex            1       103 403990 31410
## + bmi            1        98 403789 31410
## + marital        2       171 403716 31410
## - smoking        2       357 404244 31413
## - depression     1       612 404499 31420
## - education      3       875 404762 31421
## - income         3      1394 405281 31431
## - age_group      3      3049 406936 31464
## - exercise       1      4451 408338 31495
## - menthlth_days  1     13238 417125 31666
## - gen_health     2     85645 489532 32944
method_comparison <- tribble(
  ~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
  "Maximum model",
    length(coef(model_maxA)) - 1,
    round(glance(model_maxA)$adj.r.squared, 4),
    round(AIC(model_maxA), 1),
    round(BIC(model_maxA), 1),
  "Backward (AIC)",
    length(coef(mod_backward)) - 1,
    round(glance(mod_backward)$adj.r.squared, 4),
    round(AIC(mod_backward), 1),
    round(BIC(mod_backward), 1),
  "Forward (AIC)",
    length(coef(mod_forward)) - 1,
    round(glance(mod_forward)$adj.r.squared, 4),
    round(AIC(mod_forward), 1),
    round(BIC(mod_forward), 1),
  "Stepwise (AIC)",
    length(coef(mod_stepwise)) - 1,
    round(glance(mod_stepwise)$adj.r.squared, 4),
    round(AIC(mod_stepwise), 1),
    round(BIC(mod_stepwise), 1)
)

method_comparison |>
  kable(caption = "Comparison of Variable Selection Methods") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Comparison of Variable Selection Methods
Method Variables selected Adj. R² AIC BIC
Maximum model 20 0.3242 54115.1 54268.8
Backward (AIC) 17 0.3239 54114.6 54247.3
Forward (AIC) 17 0.3239 54114.6 54247.3
Stepwise (AIC) 17 0.3239 54114.6 54247.3

In 3-5 sentences, compare the results: • Do the three methods agree on the same final model? • Which variables (if any) were excluded by all three methods? • Which variables were retained by all three methods

When using the backward elimination, forward and stepwise, all three methods selected the same model with 17 predictor terms (Adjusted R^2 = 0.024, AIC = 54114.6, BIC = 54247.3). Based on the model, each excluded specific variables but not on all three methods. Variables such as smoking, exercise and mentally unhealthy days were retained by all three methods.

Step 4: Final Model Selection and Interpretation

Choose your final model. State which method(s) guided your choice and why. Display the coefficient table for your final model using tidy() with confidence intervals

My final model is the maximum model in order to see the outcome of physically unhealthy days and its correlation to the predictors used.

mod_max <- lm(physhlth_days ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
              data = brfss_analytic_new)

tidy(mod_max, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Maximum Model: All Candidate Predictors",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Maximum Model: All Candidate Predictors
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 2.6655 0.6451 4.1321 0.0000 1.4010 3.9300
menthlth_days 0.1836 0.0114 16.1723 0.0000 0.1614 0.2059
age_group35-49 0.7693 0.2836 2.7123 0.0067 0.2133 1.3254
age_group50-64 1.4011 0.2787 5.0264 0.0000 0.8547 1.9475
age_group65+ 1.7272 0.2785 6.2016 0.0000 1.1813 2.2732
sexFemale 0.2345 0.1639 1.4308 0.1525 -0.0868 0.5558
educationHS/GED 0.2937 0.4009 0.7326 0.4638 -0.4921 1.0796
educationSome college 1.0020 0.4027 2.4880 0.0129 0.2125 1.7914
educationCollege graduate 1.0526 0.4044 2.6026 0.0093 0.2598 1.8454
depressionYes 0.7754 0.2153 3.6016 0.0003 0.3533 1.1974
exerciseYes -1.9329 0.2041 -9.4711 0.0000 -2.3329 -1.5328
smokingFormer smoker -0.0264 0.2845 -0.0927 0.9261 -0.5840 0.5312
smokingNever smoker -0.4641 0.2662 -1.7438 0.0812 -0.9859 0.0576
bmi -0.0188 0.0129 -1.4589 0.1446 -0.0440 0.0065
income$25-$49 -1.0844 0.2763 -3.9251 0.0001 -1.6260 -0.5428
income$50-$99 -1.5238 0.2761 -5.5199 0.0000 -2.0650 -0.9827
income$100k+ -1.3158 0.3982 -3.3043 0.0010 -2.0964 -0.5352
gen_healthGood 1.4098 0.1863 7.5655 0.0000 1.0445 1.7751
gen_healthFair/Poor 10.0651 0.2524 39.8790 0.0000 9.5703 10.5598
maritalPreviously married -0.2670 0.2068 -1.2912 0.1967 -0.6724 0.1384
maritalNever married -0.4124 0.2490 -1.6565 0.0977 -0.9005 0.0756

Interpret the coefficients for at least three predictors in plain language, including units and “holding all other variables constant” language. Include at least one continuous and one categorical predictor.

Menthlth_days(continuous predictor) = each additional day of mentally unhealthy days is associated with an estimated 0.184 additional physically unhealthy days on average, holding age, sex, education, depression, smoking, BMI, income, gen health status and marital status constant.

Depression (continuous predictor) = each one-unit increase in the depression is associated with 0.775 fewer physically unhealthy days on average, holding all other variables constant.

Sex (categorical predictor) = Female vs Male = compared to males (reference group), females report an estimated 0.235 more physically unhealthy days on average, holding all other variables constant.

Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(model_maxA)

par(mfrow = c(1, 1))
  1. Residuals vs. Fitted: Is there evidence of non-linearity or non-constant variance?

The plot shows there is evidence of non-linearity variance since the plots show a random scatter around 0 and there is no specific pattern.

  1. Normal Q-Q: Do the residuals approximately follow a normal distribution? Where do they deviate?

The residuals do follow a normal distribution until the value is 2, then it starts to deviate in an increase direction.

  1. Scale-Location: Is the spread of residuals roughly constant across fitted values?

Yes, the spread of residuals roughly constant across the fitted values. As shows, many of the dots follow the flat red line and is a constant spread.

  1. Residuals vs. Leverage: Are there any highly influential observations (large Cook’s distance)?

Yes, there are highly influential observations as the x-axis increases. There are observations at 0.008, 0.010 and 0.014.

Write a brief conclusion (2-3 sentences): Are the LINE assumptions reasonably satisfied? What violations, if any, do you observe?

Given the line assumptions represented above, they are all reasonably stratified. Possibly violations that could occur include normality depending on the size of the sample, which can lead to biases and homoscedasticity.

#Part 2: Logistic Regression

You will now reframe the research question using a binary outcome. The CDC defines frequent physical distress as reporting 14 or more physically unhealthy days in the past 30 days.

Step 1: Create the Binary Outcome

brfss_analytic_new <- brfss_analytic_new %>%
mutate(
  frequent_distress = case_when(
    physhlth_days >= 14 ~ "Yes",
    physhlth_days < 14 ~ "No",
    TRUE ~ NA_character_
  ),
  frequent_distress = factor(frequent_distress,
        levels = c ("No", "Yes"))
  
)

tab <- table(brfss_analytic_new$frequent_distress)

freq_table <- data.frame(
  "Frequent Distress" =  names(tab),
  N = as.vector(tab),
  Percent = paste0(round(100 * prop.table(tab), 1), "%")
)

knitr::kable(
  freq_table,
  col.names = c("Frequent Distress", "N", "%"),
  caption = "Outcome Distribution")
Outcome Distribution
Frequent Distress N %
No 6948 86.9%
Yes 1052 13.2%

In 1-2 sentences, comment on the balance of the outcome

When looking at frequent distress, the individuals who reported not being distressed were significantly higher compared to the individuals who did report distress. This means that the individuals who did not experience distress had few physically unhealthy days.

Step 2: Simple (Unadjusted) Logistic Regression

Choose one predictor from your final linear model that you expect to have a strong association with physical distress. State why you chose it

I chose the predictor BMI from my final linear model because the higher an individuals BMI is, the higher the chance of them experiencing physical distress. This can be due to physical health barriers and emotional challenges.

mod_simple <- glm(frequent_distress ~ bmi,
data = brfss_analytic_new, family = binomial)

summary(mod_simple)
## 
## Call:
## glm(formula = frequent_distress ~ bmi, family = binomial, data = brfss_analytic_new)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.993253   0.142112 -21.063  < 2e-16 ***
## bmi          0.037759   0.004624   8.165 3.21e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 6164.3  on 7998  degrees of freedom
## AIC: 6168.3
## 
## Number of Fisher Scoring iterations: 4
mod_simple <- glm(frequent_distress ~ bmi, data = brfss_analytic_new,
               family = binomial(link = "logit"))

tidy(mod_simple, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Simple Logistic Regression: Frequent Distress ~ BMI (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Simple Logistic Regression: Frequent Distress ~ BMI (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.050 0.142 -21.063 0 0.038 0.066
bmi 1.038 0.005 8.165 0 1.029 1.048
exp(coef(mod_simple)) # Odds ratio
## (Intercept)         bmi 
##   0.0501241   1.0384807
exp(confint(mod_simple)) # 95% CI for OR
##                  2.5 %     97.5 %
## (Intercept) 0.03792694 0.06621861
## bmi         1.02906666 1.04790124

Interpret the odds ratio in plain language.

For every one-unit increase in BMI, the odds of frequent physical distress are multiplied by 1.038, 95% CI [1.029, 1.048].

State whether the association is statistically significant at alpha = 0.05 and how you know (Wald test p-value or CI excluding 1.0)

Since the p-value = 0, this means it is less than 0.05. Therefore, it is statistically significant and there is a positive correlation between BMI and physical distress.

Step 3: Multiple Logistic Regression

mod_logistic <- glm(frequent_distress ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
data = brfss_analytic_new, family = binomial)

summary(mod_logistic)
## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + age_group + 
##     sex + education + depression + exercise + smoking + bmi + 
##     income + gen_health + marital, family = binomial, data = brfss_analytic_new)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.431364   0.297352 -11.540  < 2e-16 ***
## menthlth_days              0.044787   0.004358  10.277  < 2e-16 ***
## age_group35-49             0.512672   0.162401   3.157  0.00160 ** 
## age_group50-64             0.767269   0.154816   4.956 7.20e-07 ***
## age_group65+               0.964882   0.155220   6.216 5.09e-10 ***
## sexFemale                  0.176389   0.081382   2.167  0.03020 *  
## educationHS/GED            0.049334   0.165069   0.299  0.76504    
## educationSome college      0.331979   0.165058   2.011  0.04430 *  
## educationCollege graduate  0.283640   0.171335   1.655  0.09783 .  
## depressionYes              0.267888   0.095954   2.792  0.00524 ** 
## exerciseYes               -0.586264   0.085768  -6.835 8.17e-12 ***
## smokingFormer smoker      -0.007660   0.121363  -0.063  0.94967    
## smokingNever smoker       -0.208048   0.116374  -1.788  0.07381 .  
## bmi                       -0.004822   0.005556  -0.868  0.38546    
## income$25-$49             -0.215375   0.111712  -1.928  0.05386 .  
## income$50-$99             -0.501752   0.118036  -4.251 2.13e-05 ***
## income$100k+              -0.541456   0.227454  -2.381  0.01729 *  
## gen_healthGood             0.895044   0.116270   7.698 1.38e-14 ***
## gen_healthFair/Poor        2.696530   0.115309  23.385  < 2e-16 ***
## maritalPreviously married -0.138611   0.096205  -1.441  0.14965    
## maritalNever married      -0.143542   0.127789  -1.123  0.26132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4420.7  on 7979  degrees of freedom
## AIC: 4462.7
## 
## Number of Fisher Scoring iterations: 6
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.032 0.297 -11.540 0.000 0.018 0.058
menthlth_days 1.046 0.004 10.277 0.000 1.037 1.055
age_group35-49 1.670 0.162 3.157 0.002 1.217 2.302
age_group50-64 2.154 0.155 4.956 0.000 1.596 2.929
age_group65+ 2.624 0.155 6.216 0.000 1.944 3.573
sexFemale 1.193 0.081 2.167 0.030 1.017 1.400
educationHS/GED 1.051 0.165 0.299 0.765 0.762 1.456
educationSome college 1.394 0.165 2.011 0.044 1.011 1.932
educationCollege graduate 1.328 0.171 1.655 0.098 0.952 1.864
depressionYes 1.307 0.096 2.792 0.005 1.082 1.576
exerciseYes 0.556 0.086 -6.835 0.000 0.470 0.658
smokingFormer smoker 0.992 0.121 -0.063 0.950 0.783 1.260
smokingNever smoker 0.812 0.116 -1.788 0.074 0.647 1.022
bmi 0.995 0.006 -0.868 0.385 0.984 1.006
income$25-$49 0.806 0.112 -1.928 0.054 0.648 1.004
income$50-$99 0.605 0.118 -4.251 0.000 0.481 0.763
income$100k+ 0.582 0.227 -2.381 0.017 0.368 0.899
gen_healthGood 2.447 0.116 7.698 0.000 1.952 3.081
gen_healthFair/Poor 14.828 0.115 23.385 0.000 11.863 18.648
maritalPreviously married 0.871 0.096 -1.441 0.150 0.720 1.050
maritalNever married 0.866 0.128 -1.123 0.261 0.673 1.111

Interpret the adjusted odds ratios for at least three predictors in plain language, using “holding all other variables constant” language. Include at least one continuous and one categorical predictor.

EducationHS/GED (continuous predictor) = each additional day of educationHS/GED days is associated with an estimated 1.051 additional physically unhealthy days on average, holding age, sex, mentally unhealthy days,depression, smoking, BMI, income, gen health status and marital status constant.

Exercise(continuous predictor) = each one-unit increase in the exercise is associated with 0.556 fewer physically unhealthy days on average, holding all other variables constant.

Sex (categorical predictor) = Female vs Male = compared to males (reference group), females report an estimated 1.193 more physically unhealthy days on average, holding all other variables constant.

Step 4: Likelihood Ratio Test

Group: All education levels

mod_reduced_education <- glm(frequent_distress ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + gen_health + marital, data = 
                  brfss_analytic_new, family = binomial)

# Likelihood ratio test
lr_test <- anova(mod_reduced_education, mod_logistic, test = "Chisq")

lr_test |>
  kable(digits = 3, caption = "Likelihood Ratio Test: Full Model vs. Null") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Likelihood Ratio Test: Full Model vs. Null
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
7982 4439.742 NA NA NA
7979 4420.659 3 19.082 0

• State the null and alternative hypotheses

H0: Group of predictors is associated with physical unhealthy days H1: Group of predictors is not associated with physical unhealthy days

• Report: test statistic (deviance) = 19.082 degrees of freedom = 3 p-value = 0

• Conclude at alpha = 0.05: Does this group of predictors significantly improve the model?

Since the p-value is less than 0.05, this group of predictors are significantly improve the model for the outcome of physically unhealthy days.

Step 5: ROC Curve and AUC

roc_obj <- roc(brfss_analytic_new$frequent_distress, fitted(mod_logistic))
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)

auc(roc_obj)
## Area under the curve: 0.8555
ci.auc(roc_obj)
## 95% CI: 0.8426-0.8683 (DeLong)

Interpret the AUC: What does this value tell you about the model’s ability to discriminate between individuals with and without frequent physical distress? Use the following guide:

o 0.5 = no discrimination (coin flip) o 0.5-0.7 = poor discrimination o 0.7-0.8 = acceptable discrimination o 0.8-0.9 = excellent discrimination o 0.9+ = outstanding discrimination

The model tells me that since the AUC = 0.8555, there is excellent discrimination between individuals with and without frequent physical distress.

Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945

• State the null hypothesis (the model fits the data well) and alternative hypothesis (the model does not fit the data well)

H0: The model fits the data for physically unhealthy days H1: The model does not fit the data for physically unhealthy days

• Report: test statistic = 5.5773 degrees of freedom = 8 p-value = 0.6945

• Interpret: At alpha = 0.05, is there evidence of poor model fit?

Since the p-value is greater than 0.05, the model does fit in some regions of predicted probability.

• In 1-2 sentences, discuss how the Hosmer-Lemeshow result complements the ROC/AUC finding.

The Hosmer-Lemeshow result complements the ROC/AUC finding by assessing the agreement between the predicted and observed rates within the predicted probability. With this data, it can also help determine the discrimination and the curve of the plot.

Part 3: Reflection

A. Statistical Insights (5 points): What do your results suggest about the factors associated with physical health burden among U.S. adults? Which predictors were the strongest in both the linear and logistic models? Were any predictors significant in one model but not the other? What are the key limitations of using cross-sectional survey data for this type of analysis? Name at least two specific potential confounders not measured in your model.

I would suggest that there are certain factors that contribute to the physical health burden among U.S. adults. For example, mentally unhealthy days, individuals who are 35-49 years old and people who have a poor general health status are at a higher chance of developing physically unhealthy days. The strongest predictors in the linear models was BMI, mentally unhealthy days, individuals ages 35-65+. The predictors remained significant in the models, but it varied on the p-value. Limitations using a cross-sectional survey for this type of analysis is that it might be more challenging to identify the temporal relationship based on the physical unhealthy days when considering other factors such as environmental contributors. Biases can also be a limitation depending on the sample size. One potential confounder not measured in the model is alcohol consumption. This can have an impact on depression, exercise, BMI, mentally unhealthy days and general health status. Another potential confounder could be ethnicity. Ethnicity can play a crucial role in the development of physically unhealthy days. Based on anindividual’s ethnicity, this can lead to factors such as lack of resources and services available to achieve healthy physical days.

B. Linear versus Logistic Regression (5 points): Compare what the linear regression model tells you versus what the logistic regression model tells you about the same research question. What information does each approach provide that the other does not? In what situations would you prefer one approach over the other? How do the model selection criteria differ between the two frameworks (for example, R-squared versus AUC)?

When comparing the linear regression model from the logistic regression model, the linear model is able to analyze and identify the significance of one predictor while the logistic regression takes into account multiple factors for the outcome of physically unhealthy days among U.S. adults. In the models, we see that linear shows how continuous predictors have an impact on the outcome while logistic uses binary outcomes. The linear regression allows us to predict the values of physically unhealthy days while also testing the hypotheses. The logistic regression allows us to analyze the adjusted odds ratios in order to control for potential confounders. I would prefer the logistic regression if I am using certain predictors and outcome variables that are both categorical. I would be able to determine the probability of an event occurring based on the correlation between predictors. The model selection criteria differ between two frameworks because each has its own methods, limitations and violations based on the model used. When using R-squared, information will be coded and analyzed differently compared to when using the AUC plot.

C. AI Transparency (5 points): Describe any parts of this assignment where you sought AI assistance. Include the specific prompts you used and how you verified the correctness of the output. If you did not use AI, state this explicitly.

I had used AI when it came to minor grammatical errors such as using too many commas, brackets or leaving certain syntax out. For creating the binary outcome, I had a hard time trying to initially write the code. I had referenced the lecture material but AI recommended how I should format the beginning. It also recommended to use names(tab) instead of levels. I used CHAT in multiple logistic regression because I was having problems why frequent distress variable was not in my data set. CHAT recommended I had to go back a few steps to recheck my codes. For the likelihood ratio test, I had trouble writing my mod_reduced section, I used reference from the lecture materials but I needed to change the formatting in my “glm” since I used all education levels.