Part 0: Data Preparation

# Load Packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
## Warning: package 'haven' was built under R version 4.5.2
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(leaps)
## Warning: package 'leaps' was built under R version 4.5.2
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.3
## ResourceSelection 0.3-6   2023-06-27
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.2
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# Load BRFSS 2023 Data
brfss_full <- read_xpt(
  "C:/Users/abbym/OneDrive/Desktop/STATS553/Assignments/Homework #4/LLCP2023.XPT"
) |>
  clean_names()
brfss_clean <- brfss_full |>

# Recode variables
  mutate(
    # Outcome: mentally unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
    menthlth_days = case_when(
      menthlth == 88              ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                        ~ NA_real_
    ),
    # Physical health days (key predictor)
    physhlth_days = case_when(
      physhlth == 88              ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                        ~ NA_real_
    ),
    # Age category
    age_group = factor(case_when(
      ageg5yr == 1 ~ "18-34",
      ageg5yr == 2 ~ "18-34",
      ageg5yr == 3 ~ "18-34",
      ageg5yr == 4 ~ "35-49",
      ageg5yr == 5 ~ "35-49",
      ageg5yr == 6 ~ "35-49",
      ageg5yr == 7 ~ "50-64",
      ageg5yr == 8 ~ "50-64",
      ageg5yr == 9 ~ "50-64",
      ageg5yr == 10 ~ "65+",
      ageg5yr == 11 ~ "65+",
      ageg5yr == 12 ~ "65+",
      ageg5yr == 13 ~ "65+",
      ageg5yr == 14 ~ NA
    )), 
    # Income
    income = factor(case_when(
      incomg1 == 1 ~ "Less than $25K",
      incomg1 == 2 ~ "Less than $25K",
      incomg1 == 3 ~ "$25K-$49K",
      incomg1 == 4 ~ "$25K-$49K",
      incomg1 == 5 ~ "$50K-$99K",
      incomg1 == 6 ~ "$50K-$99K",
      incomg1 == 7 ~ "$100K+",
      incomg1 == 9 ~ NA
    )), 
    #Relevel Income
    income = relevel(income, ref= "Less than $25K"),
    
    # Education
    education = factor(case_when(
      educa == 1 ~ "Less than high school", 
      educa == 2 ~ "Less than high school",
      educa == 3 ~ "Less than high school",
      educa == 4 ~ "HS/GED",
      educa == 5 ~ "Some College",
      educa == 6 ~ "College Graduate",
      educa == 9 ~ NA
    )), 
    # Relevel education
    education = relevel(education, ref= "Less than high school"),
    
    # Smoker
    smoking = factor(case_when(
      smoker3 == 1 ~ "Current",
      smoker3 == 2 ~ "Current",
      smoker3 == 3 ~ "Former",
      smoker3 == 4 ~ "Never",
      smoker3 == 9 ~ NA
    )),
    # Relevel smoking
    smoking = relevel(smoking, ref= "Never"),
    
    # Sex
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    # Exercise in past 30 days (any physical activity)
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    # BMI (stored as integer × 100 in BRFSS)
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_)
  ) |>
  
  # Select Variables
 dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, age_group, income, education, smoking)
brfss_clean|>
  select(physhlth_days, menthlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
  summarise(
    across(
      everything(),
      list(
        n_missing = ~ sum(is.na(.) | . %in% c(77, 88, 99, 7777, 9999)),
        pct_missing = ~ sum(is.na(.) | . %in% c(77, 88, 99, 7777, 9999)) / n() * 100
      ),
      .names = "{.col}_{.fn}"
    )
  ) |>
  pivot_longer(
    everything(),
    names_to = c("Variable", ".value"),
    names_pattern = "(.*)_(n_missing|pct_missing)"
  ) |>
  mutate(pct_missing = round(pct_missing, 1)) |>
  arrange(desc(pct_missing)) |>
  kable(
    caption = "Missing or Don't Know/Refused (%) — BRFSS full sample",
    col.names = c("Variable", "N Missing / DK / Refused", "% Missing / DK / Refused")
  ) |>
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  )
Missing or Don’t Know/Refused (%) — BRFSS full sample
Variable N Missing / DK / Refused % Missing / DK / Refused
income 86623 20.0
bmi 40535 9.4
smoking 23062 5.3
physhlth_days 10785 2.5
menthlth_days 8108 1.9
age_group 7779 1.8
education 2325 0.5
exercise 1251 0.3
sex 0 0.0
set.seed(1220)
brfss_analytic <- brfss_clean |>
 drop_na() |>
 slice_sample(n = 8000)

# Report analytic sample size 

nrow(brfss_analytic)
## [1] 8000
brfss_analytic |>
  tbl_summary(
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi     ~ "BMI (kg/m^2)",
      age_group           ~ "Age Group (years)",
      income      ~ "Household income",
      exercise      ~ "Any physical activity (past 30 days)",
      education ~ "Education Level",
      smoking ~ "Smoking status",
      sex ~ "Sex"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  # add_overall() |>
  bold_labels() |>
  #italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**")
Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)
Characteristic N N = 8,0001
Physically unhealthy days (past 30) 8,000 4.4 (8.7)
Mentally unhealthy days (past 30) 8,000 4.3 (8.1)
BMI (kg/m^2) 8,000 28.7 (6.5)
Sex 8,000
    Male
3,936 (49%)
    Female
4,064 (51%)
Any physical activity (past 30 days) 8,000 6,240 (78%)
Age Group (years) 8,000
    18-34
1,277 (16%)
    35-49
1,665 (21%)
    50-64
2,055 (26%)
    65+
3,003 (38%)
Household income 8,000
    Less than $25K
1,048 (13%)
    $100K+
638 (8.0%)
    $25K-$49K
1,938 (24%)
    $50K-$99K
4,376 (55%)
Education Level 8,000
    Less than high school
376 (4.7%)
    College Graduate
3,659 (46%)
    HS/GED
1,827 (23%)
    Some College
2,138 (27%)
Smoking status 8,000
    Never
4,812 (60%)
    Current
926 (12%)
    Former
2,262 (28%)
1 Mean (SD); n (%)
saveRDS(brfss_analytic,
 "C:/Users/abbym/OneDrive/Desktop/STATS553/Assignments/Homework #4/brfss_analytic.rds")

Part 1: Model Selection for Linear Regression

# Fit maximum model
mod_max <- lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)

# Display summary
summary(mod_max)
## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.6671  -3.6770  -1.9783   0.6429  31.1695 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                5.78494    0.65888   8.780  < 2e-16 ***
## menthlth_days              0.30399    0.01143  26.586  < 2e-16 ***
## bmi                        0.04807    0.01399   3.436 0.000593 ***
## sexFemale                  0.03086    0.17943   0.172 0.863456    
## exerciseYes               -3.44946    0.22542 -15.302  < 2e-16 ***
## age_group35-49             1.34457    0.29864   4.502 6.82e-06 ***
## age_group50-64             2.64207    0.28757   9.188  < 2e-16 ***
## age_group65+               2.87902    0.27487  10.474  < 2e-16 ***
## income$100K+              -3.81913    0.43275  -8.825  < 2e-16 ***
## income$25K-$49K           -2.62287    0.30737  -8.533  < 2e-16 ***
## income$50K-$99K           -3.23452    0.29614 -10.922  < 2e-16 ***
## educationCollege Graduate -1.23624    0.45384  -2.724 0.006464 ** 
## educationHS/GED           -1.19956    0.45099  -2.660 0.007834 ** 
## educationSome College     -0.72133    0.45299  -1.592 0.111339    
## smokingCurrent             1.15379    0.29713   3.883 0.000104 ***
## smokingFormer              0.67961    0.20590   3.301 0.000968 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.859 on 7984 degrees of freedom
## Multiple R-squared:  0.1844, Adjusted R-squared:  0.1829 
## F-statistic: 120.4 on 15 and 7984 DF,  p-value: < 2.2e-16
# Display AIC and BIC
AIC(mod_max)
## [1] 55707.42
BIC(mod_max)
## [1] 55826.2

Interpretation: The r-squared is 0.1849, meaning that the predictors in the model can explain 18.49% of the variance in physically unhealthy days. The adjusted r-squared is 0.1833, meaning that the predictors in the model can explain 18.33% of the variance in physically unhealthy days, while being penalized for the amount of predictors in the model. The AIC is 55704.55 and the BIC is 55830.32. The AIC and BIC are helpful for comparing simpler models to the maximum model.

# Prepare a model matrix (need numeric predictors for leaps)
# Use the formula interface approach
best_subsets <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + income + education + smoking,
  data = brfss_analytic,
  nvmax = 17,      # maximum number of variables to consider
  method = "exhaustive"
)

best_summary <- summary(best_subsets)
subset_metrics <- tibble(
  p = 1:length(best_summary$adjr2),
  `Adj. R²` = best_summary$adjr2,
  BIC = best_summary$bic,
  Cp = best_summary$cp
)

p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.max(best_summary$adjr2),
             linetype = "dashed", color = "tomato") +
  labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
  theme_minimal(base_size = 12)

p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.min(best_summary$bic),
             linetype = "dashed", color = "tomato") +
  labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
  theme_minimal(base_size = 12)

gridExtra::grid.arrange(p1, p2, ncol = 2)

Interpretation: A model size of 14 predictors maximizes the adjusted r-squared, and a model size of 11 predictors minimizes the BIC. The two criteria do not agree on a best model size. The BIC favors a simpler model.

# Automated backward elimination using AIC
mod_backward <- step(mod_max, direction = "backward", trace = 1)
## Start:  AIC=33002.4
## physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## - sex            1         2 493117 33000
## <none>                       493115 33002
## - education      3       747 493862 33009
## - bmi            1       729 493844 33012
## - smoking        2      1293 494408 33019
## - income         3      7823 500938 33122
## - age_group      3      8240 501355 33129
## - exercise       1     14462 507577 33232
## - menthlth_days  1     43656 536771 33679
## 
## Step:  AIC=33000.43
## physhlth_days ~ menthlth_days + bmi + exercise + age_group + 
##     income + education + smoking
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       493117 33000
## - education      3       745 493862 33007
## - bmi            1       728 493845 33010
## - smoking        2      1295 494412 33017
## - income         3      7909 501026 33122
## - age_group      3      8302 501418 33128
## - exercise       1     14480 507597 33230
## - menthlth_days  1     44141 537257 33684
# Table
tidy(mod_backward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Backward Elimination Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Backward Elimination Result (AIC-based)
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 5.8000 0.6530 8.8821 0.0000 4.5200 7.0800
menthlth_days 0.3042 0.0114 26.7352 0.0000 0.2819 0.3265
bmi 0.0480 0.0140 3.4339 0.0006 0.0206 0.0754
exerciseYes -3.4505 0.2253 -15.3127 0.0000 -3.8922 -3.0088
age_group35-49 1.3477 0.2981 4.5215 0.0000 0.7634 1.9320
age_group50-64 2.6452 0.2870 9.2171 0.0000 2.0826 3.2077
age_group65+ 2.8828 0.2740 10.5221 0.0000 2.3457 3.4198
income$100K+ -3.8271 0.4303 -8.8949 0.0000 -4.6705 -2.9837
income$25K-$49K -2.6241 0.3073 -8.5398 0.0000 -3.2264 -2.0217
income$50K-$99K -3.2385 0.2952 -10.9701 0.0000 -3.8172 -2.6598
educationCollege Graduate -1.2319 0.4531 -2.7188 0.0066 -2.1201 -0.3437
educationHS/GED -1.1978 0.4508 -2.6567 0.0079 -2.0815 -0.3140
educationSome College -0.7178 0.4525 -1.5863 0.1127 -1.6048 0.1692
smokingCurrent 1.1503 0.2964 3.8807 0.0001 0.5692 1.7313
smokingFormer 0.6768 0.2052 3.2977 0.0010 0.2745 1.0791
# Automated forward selection using AIC
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

mod_forward <- step(mod_null,
                    scope = list(lower = mod_null, upper = mod_max),
                    direction = "forward", trace = 1)
## Start:  AIC=34603.45
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     59071 545563 33783
## + exercise       1     37706 566927 34090
## + income         3     32443 572191 34168
## + smoking        2     12018 592616 34447
## + education      3     11873 592761 34451
## + age_group      3      5515 599119 34536
## + bmi            1      4732 599901 34543
## + sex            1      1559 603075 34585
## <none>                       604634 34603
## 
## Step:  AIC=33783.01
## physhlth_days ~ menthlth_days
## 
##             Df Sum of Sq    RSS   AIC
## + exercise   1   28346.8 517216 33358
## + income     3   19686.5 525876 33495
## + age_group  3   14455.2 531108 33574
## + education  3    8240.9 537322 33667
## + smoking    2    6070.4 539492 33697
## + bmi        1    2821.0 542742 33744
## + sex        1     266.4 545296 33781
## <none>                   545563 33783
## 
## Step:  AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
## 
##             Df Sum of Sq    RSS   AIC
## + income     3   11764.1 505452 33180
## + age_group  3    9921.3 507295 33209
## + smoking    2    3952.9 513263 33301
## + education  3    3409.0 513807 33311
## + bmi        1     753.5 516463 33348
## <none>                   517216 33358
## + sex        1     106.7 517109 33359
## 
## Step:  AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
## 
##             Df Sum of Sq    RSS   AIC
## + age_group  3    9400.2 496052 33036
## + smoking    2    2483.5 502968 33145
## + bmi        1     781.6 504670 33170
## + education  3     937.1 504515 33171
## <none>                   505452 33180
## + sex        1       6.7 505445 33182
## 
## Step:  AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
## 
##             Df Sum of Sq    RSS   AIC
## + smoking    2   1427.13 494625 33017
## + education  3   1013.78 495038 33026
## + bmi        1    666.76 495385 33027
## <none>                   496052 33036
## + sex        1     14.93 496037 33038
## 
## Step:  AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking
## 
##             Df Sum of Sq    RSS   AIC
## + bmi        1    762.78 493862 33007
## + education  3    779.94 493845 33010
## <none>                   494625 33017
## + sex        1      0.00 494625 33019
## 
## Step:  AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi
## 
##             Df Sum of Sq    RSS   AIC
## + education  3    745.36 493117 33000
## <none>                   493862 33007
## + sex        1      0.35 493862 33009
## 
## Step:  AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi + education
## 
##        Df Sum of Sq    RSS   AIC
## <none>              493117 33000
## + sex   1    1.8268 493115 33002
# Table
tidy(mod_forward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Forward Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Forward Selection Result (AIC-based)
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 5.8000 0.6530 8.8821 0.0000 4.5200 7.0800
menthlth_days 0.3042 0.0114 26.7352 0.0000 0.2819 0.3265
exerciseYes -3.4505 0.2253 -15.3127 0.0000 -3.8922 -3.0088
income$100K+ -3.8271 0.4303 -8.8949 0.0000 -4.6705 -2.9837
income$25K-$49K -2.6241 0.3073 -8.5398 0.0000 -3.2264 -2.0217
income$50K-$99K -3.2385 0.2952 -10.9701 0.0000 -3.8172 -2.6598
age_group35-49 1.3477 0.2981 4.5215 0.0000 0.7634 1.9320
age_group50-64 2.6452 0.2870 9.2171 0.0000 2.0826 3.2077
age_group65+ 2.8828 0.2740 10.5221 0.0000 2.3457 3.4198
smokingCurrent 1.1503 0.2964 3.8807 0.0001 0.5692 1.7313
smokingFormer 0.6768 0.2052 3.2977 0.0010 0.2745 1.0791
bmi 0.0480 0.0140 3.4339 0.0006 0.0206 0.0754
educationCollege Graduate -1.2319 0.4531 -2.7188 0.0066 -2.1201 -0.3437
educationHS/GED -1.1978 0.4508 -2.6567 0.0079 -2.0815 -0.3140
educationSome College -0.7178 0.4525 -1.5863 0.1127 -1.6048 0.1692
mod_stepwise <- step(mod_null,
                     scope = list(lower = mod_null, upper = mod_max),
                     direction = "both", trace = 1)
## Start:  AIC=34603.45
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     59071 545563 33783
## + exercise       1     37706 566927 34090
## + income         3     32443 572191 34168
## + smoking        2     12018 592616 34447
## + education      3     11873 592761 34451
## + age_group      3      5515 599119 34536
## + bmi            1      4732 599901 34543
## + sex            1      1559 603075 34585
## <none>                       604634 34603
## 
## Step:  AIC=33783.01
## physhlth_days ~ menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1     28347 517216 33358
## + income         3     19686 525876 33495
## + age_group      3     14455 531108 33574
## + education      3      8241 537322 33667
## + smoking        2      6070 539492 33697
## + bmi            1      2821 542742 33744
## + sex            1       266 545296 33781
## <none>                       545563 33783
## - menthlth_days  1     59071 604634 34603
## 
## Step:  AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3     11764 505452 33180
## + age_group      3      9921 507295 33209
## + smoking        2      3953 513263 33301
## + education      3      3409 513807 33311
## + bmi            1       754 516463 33348
## <none>                       517216 33358
## + sex            1       107 517109 33359
## - exercise       1     28347 545563 33783
## - menthlth_days  1     49711 566927 34090
## 
## Step:  AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      9400 496052 33036
## + smoking        2      2484 502968 33145
## + bmi            1       782 504670 33170
## + education      3       937 504515 33171
## <none>                       505452 33180
## + sex            1         7 505445 33182
## - income         3     11764 517216 33358
## - exercise       1     20424 525876 33495
## - menthlth_days  1     41532 546984 33810
## 
## Step:  AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + smoking        2      1427 494625 33017
## + education      3      1014 495038 33026
## + bmi            1       667 495385 33027
## <none>                       496052 33036
## + sex            1        15 496037 33038
## - age_group      3      9400 505452 33180
## - income         3     11243 507295 33209
## - exercise       1     17199 513251 33307
## - menthlth_days  1     47177 543229 33761
## 
## Step:  AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking
## 
##                 Df Sum of Sq    RSS   AIC
## + bmi            1       763 493862 33007
## + education      3       780 493845 33010
## <none>                       494625 33017
## + sex            1         0 494625 33019
## - smoking        2      1427 496052 33036
## - age_group      3      8344 502968 33145
## - income         3      9897 504522 33169
## - exercise       1     16705 511329 33281
## - menthlth_days  1     44789 539414 33708
## 
## Step:  AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       745 493117 33000
## <none>                       493862 33007
## + sex            1         0 493862 33009
## - bmi            1       763 494625 33017
## - smoking        2      1523 495385 33027
## - age_group      3      8245 502107 33133
## - income         3      9781 503643 33157
## - exercise       1     15215 509077 33247
## - menthlth_days  1     44206 538068 33690
## 
## Step:  AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi + education
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       493117 33000
## + sex            1         2 493115 33002
## - education      3       745 493862 33007
## - bmi            1       728 493845 33010
## - smoking        2      1295 494412 33017
## - income         3      7909 501026 33122
## - age_group      3      8302 501418 33128
## - exercise       1     14480 507597 33230
## - menthlth_days  1     44141 537257 33684
# Table 
tidy(mod_stepwise, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stepwise Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Stepwise Selection Result (AIC-based)
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 5.8000 0.6530 8.8821 0.0000 4.5200 7.0800
menthlth_days 0.3042 0.0114 26.7352 0.0000 0.2819 0.3265
exerciseYes -3.4505 0.2253 -15.3127 0.0000 -3.8922 -3.0088
income$100K+ -3.8271 0.4303 -8.8949 0.0000 -4.6705 -2.9837
income$25K-$49K -2.6241 0.3073 -8.5398 0.0000 -3.2264 -2.0217
income$50K-$99K -3.2385 0.2952 -10.9701 0.0000 -3.8172 -2.6598
age_group35-49 1.3477 0.2981 4.5215 0.0000 0.7634 1.9320
age_group50-64 2.6452 0.2870 9.2171 0.0000 2.0826 3.2077
age_group65+ 2.8828 0.2740 10.5221 0.0000 2.3457 3.4198
smokingCurrent 1.1503 0.2964 3.8807 0.0001 0.5692 1.7313
smokingFormer 0.6768 0.2052 3.2977 0.0010 0.2745 1.0791
bmi 0.0480 0.0140 3.4339 0.0006 0.0206 0.0754
educationCollege Graduate -1.2319 0.4531 -2.7188 0.0066 -2.1201 -0.3437
educationHS/GED -1.1978 0.4508 -2.6567 0.0079 -2.0815 -0.3140
educationSome College -0.7178 0.4525 -1.5863 0.1127 -1.6048 0.1692

Interpretation: The three methods all agreed on the same model. All of the methods got rid of the sex predictor. Mentally unhealthy days, exercise, income, age group, smoking status, BMI, and education level were all preserved by all three methods of model selection.

tidy(mod_stepwise, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stepwise Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Stepwise Selection Result (AIC-based)
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 5.8000 0.6530 8.8821 0.0000 4.5200 7.0800
menthlth_days 0.3042 0.0114 26.7352 0.0000 0.2819 0.3265
exerciseYes -3.4505 0.2253 -15.3127 0.0000 -3.8922 -3.0088
income$100K+ -3.8271 0.4303 -8.8949 0.0000 -4.6705 -2.9837
income$25K-$49K -2.6241 0.3073 -8.5398 0.0000 -3.2264 -2.0217
income$50K-$99K -3.2385 0.2952 -10.9701 0.0000 -3.8172 -2.6598
age_group35-49 1.3477 0.2981 4.5215 0.0000 0.7634 1.9320
age_group50-64 2.6452 0.2870 9.2171 0.0000 2.0826 3.2077
age_group65+ 2.8828 0.2740 10.5221 0.0000 2.3457 3.4198
smokingCurrent 1.1503 0.2964 3.8807 0.0001 0.5692 1.7313
smokingFormer 0.6768 0.2052 3.2977 0.0010 0.2745 1.0791
bmi 0.0480 0.0140 3.4339 0.0006 0.0206 0.0754
educationCollege Graduate -1.2319 0.4531 -2.7188 0.0066 -2.1201 -0.3437
educationHS/GED -1.1978 0.4508 -2.6567 0.0079 -2.0815 -0.3140
educationSome College -0.7178 0.4525 -1.5863 0.1127 -1.6048 0.1692

Interpretation: Since all three methods of model selection chose the same model, I am choosing the model all three created for my final model. For every one unit increase in mentally unhealthy days, physically unhealthy days will increase by 0.3042 days, holding all other variables constant. For every one unit increase in BMI, physically unhealthy days will increase by 0.0480 days, holding all other variables constant. Compared those who don’t exercise, those who do exercise have 3.4505 fewer physically unhealthy days, holding all other variables constant.

par(mfrow = c(2, 2))
plot(mod_stepwise)

par(mfrow = c(1, 1))

Interpretation: Residuals vs. Fitted: The red line is not exactly horizontal, meaning the data aren’t perfectly linear. In addition, there may be some evidence of heteroscedasticity, as the data don’t seem to be exactly randomly scattered. Q-Q Residuals: There are deviations from the normality line, meaning the data aren’t exactly normally distributed. The largest deviation is on the right side of the tail. Scale-Location: There is large evidence of heteroscedasticity here, as the red line is not horizontal. The variances are not equal. Residuals vs Leverage: There are many observations beyond Cook’s distance, meaning there are potentially highly influential observations within the dataset.

The LINE assumptions are violated in many ways. However, the sample size is so large these deviations may not cause issues throughout the analysis. There is evidence that the data is not linear, the variances aren’t equally, it isn’t normally distributed, and that independence is violated.

Part 2: Logistic Regression

brfss_log <- brfss_analytic |>
  mutate(fpd = factor(
      ifelse(physhlth_days >= 14, 1, 0),
      levels = c(0, 1),
      labels = c("No", "Yes")
    ))
brfss_log %>%
  select(fpd) %>%
  tbl_summary(
    type = list(fpd ~ "categorical"),
    label = list(
      fpd ~ "Frequent Physical Distress"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    )) %>%
  add_n() %>%
  bold_labels() %>%
  modify_caption("**Frequent Physical Distress Descriptive Statistics**")
Frequent Physical Distress Descriptive Statistics
Characteristic N N = 8,0001
Frequent Physical Distress 8,000
    No
6,914 (86%)
    Yes
1,086 (14%)
1 n (%)

Interpretation: There are many more people who don’t experience frequent physical distress than people who do. 86% of people in the dataset do not experience it, and 14% of people do.

mod_simple <- glm(fpd ~ bmi,
 data = brfss_log, family = binomial)

# Model Summary
summary(mod_simple)
## 
## Call:
## glm(formula = fpd ~ bmi, family = binomial, data = brfss_log)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.771987   0.141013 -19.658  < 2e-16 ***
## bmi          0.031542   0.004619   6.829 8.57e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6354.8  on 7999  degrees of freedom
## Residual deviance: 6310.3  on 7998  degrees of freedom
## AIC: 6314.3
## 
## Number of Fisher Scoring iterations: 4
exp(coef(mod_simple)) # Odds ratio
## (Intercept)         bmi 
##  0.06253761  1.03204520
exp(confint(mod_simple)) # 95% CI for OR
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) 0.04743404 0.08246141
## bmi         1.02269073 1.04138719

Interpretation: For every one-unit increase in BMI, the odds of frequent physical distress are multiplied by 1.032, 95% CI [1.023, 1.041]. The association is statistically significant, as the p-value from the Wald test is 0, which is less than 0.05, meaning you can reject the null hypothesis that there is no association between BMI and frequent physical distress.

mod_logistic <- glm(fpd ~ menthlth_days + exercise + income + age_group + smoking + bmi + education,
 data = brfss_log, family = binomial)

summary(mod_logistic)
## 
## Call:
## glm(formula = fpd ~ menthlth_days + exercise + income + age_group + 
##     smoking + bmi + education, family = binomial, data = brfss_log)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -2.249364   0.241714  -9.306  < 2e-16 ***
## menthlth_days              0.068264   0.003648  18.712  < 2e-16 ***
## exerciseYes               -0.885478   0.076964 -11.505  < 2e-16 ***
## income$100K+              -1.502512   0.229493  -6.547 5.87e-11 ***
## income$25K-$49K           -0.590324   0.101531  -5.814 6.09e-09 ***
## income$50K-$99K           -0.855241   0.100385  -8.520  < 2e-16 ***
## age_group35-49             0.763898   0.147572   5.176 2.26e-07 ***
## age_group50-64             1.214414   0.139734   8.691  < 2e-16 ***
## age_group65+               1.293078   0.136943   9.442  < 2e-16 ***
## smokingCurrent             0.357308   0.105325   3.392 0.000693 ***
## smokingFormer              0.294754   0.081332   3.624 0.000290 ***
## bmi                        0.014360   0.005155   2.786 0.005344 ** 
## educationCollege Graduate -0.493111   0.154804  -3.185 0.001446 ** 
## educationHS/GED           -0.335329   0.148966  -2.251 0.024383 *  
## educationSome College     -0.146800   0.149691  -0.981 0.326745    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6354.8  on 7999  degrees of freedom
## Residual deviance: 5280.6  on 7985  degrees of freedom
## AIC: 5310.6
## 
## Number of Fisher Scoring iterations: 6
# Table
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3,
        caption = "Logistic Regression: FMD ~ menthlth_days + exercise + income + age_group + smoking + bmi + education (Odds Ratio Scale)") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Logistic Regression: FMD ~ menthlth_days + exercise + income + age_group + smoking + bmi + education (Odds Ratio Scale)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.105 0.242 -9.306 0.000 0.065 0.169
menthlth_days 1.071 0.004 18.712 0.000 1.063 1.078
exerciseYes 0.413 0.077 -11.505 0.000 0.355 0.480
income$100K+ 0.223 0.229 -6.547 0.000 0.139 0.343
income$25K-$49K 0.554 0.102 -5.814 0.000 0.454 0.676
income$50K-$99K 0.425 0.100 -8.520 0.000 0.349 0.518
age_group35-49 2.147 0.148 5.176 0.000 1.613 2.878
age_group50-64 3.368 0.140 8.691 0.000 2.573 4.452
age_group65+ 3.644 0.137 9.442 0.000 2.800 4.792
smokingCurrent 1.429 0.105 3.392 0.001 1.161 1.755
smokingFormer 1.343 0.081 3.624 0.000 1.144 1.574
bmi 1.014 0.005 2.786 0.005 1.004 1.025
educationCollege Graduate 0.611 0.155 -3.185 0.001 0.452 0.830
educationHS/GED 0.715 0.149 -2.251 0.024 0.535 0.961
educationSome College 0.863 0.150 -0.981 0.327 0.646 1.162

Interpretation: For everyone 1 unit increase in mentally unhealthy days, the odds of frequent physical distress are multiplied by 1.071, holding all other variables constant. For every 1 unit increase in bmi, the odds of frequent physical distress are multiplied by 1.014, holding all other variables constant. Compared to those who don’t exercise, those who do have 58.7% decreased odds of having frequent physical distress, holding all other variables constant.

mod_reduced <- glm(fpd ~ menthlth_days + exercise + age_group + smoking + bmi + education,
 data = brfss_log, family = binomial)

# Liklihood Ratio Test 
anova(mod_reduced, mod_logistic, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: fpd ~ menthlth_days + exercise + age_group + smoking + bmi + 
##     education
## Model 2: fpd ~ menthlth_days + exercise + income + age_group + smoking + 
##     bmi + education
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7988     5367.9                          
## 2      7985     5280.6  3   87.353 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation: The null hypothesis of the LRT is that income has no effect on the model predicting frequent physical distress, and the alternative hypothesis is that income does have an effect on the model predicting frequent physical distress. The test statistic is 87.353, the degrees of freedom are 3, and the p-value is < 2.2e-16. Since the p-value is less than 0.05, you reject the null hypothesis, meaning income does help to predict frequent mental distress.

roc_obj <- roc(brfss_log$fpd, fitted(mod_logistic))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
 print.auc = TRUE)

auc(roc_obj)
## Area under the curve: 0.7773
ci.auc(roc_obj)
## 95% CI: 0.7618-0.7927 (DeLong)

Interpretation: The AUC of the model is 0.777, meaning the model has acceptable discrimination between those with frequent physical distress and those without frequent physical distress.

hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 10.713, df = 8, p-value = 0.2185

Interpretation: The null hypothesis is that the model fits the data well and the alternative hypothesis is that the model does not fit the data well. The t-statistic is 10.713, the degrees of freedom are 8, and the p-value is 0.2185. Since the p-value is greater than 0.05, you fail to reject the null hypothesis that the model doesn’t fit the data well. The model is not a poor fit for the data. The Hosmer-Lemeshow test complements the ROC/AUC findings well, because it shows that the model fits the data well (there’s agreement between predicted and observed events), but that it can also discriminate between “cases” and “non-cases”. It is good to test both because models can have good discrimination and poor fit, or vice versa.

Part 3: Reflection There are many factors that play a role in the physical health burden among US adults. These factors include mentally unhealthy days, sex, exercise status, age group, income, education level, BMI, smoking status, and I’m sure others, as well. It would be difficult to perfectly predict whether a person has frequent physical distress, as there are so many factors that play a role in determining it. Age group and income had the strongest association in both the linear and logistic models. All of the predictors that were significant in the linear model were also significant in the logistic model. EducationSomeCollege was insignificant in both the linear and logistic model. The key limitations of using cross-sectional survey data is not being able to establish temporality. Because the study is cross-sectional, all of the data is collected at the same time, so you can’t establish whether one factor caused another factor because you don’t know which one came first. Two potential confounders that were not measured in the model could be whether a person has a condition like Multiple Sclerosis, which could impact their physical ability, and a variable such as the general health of a person. The linear regression gives us a tangible number of physically unhealthy days that each person of a certain characteristic is predicted to have, and logistic regression gives you the odds of a person having frequent physical distress based on a certain characteristic they possess. Linear regression gives us a predicted number of units that a person will possess for a certain characteristic and logistic regression gives us a predicted odds of developing/having a certain condition.You would prefer linear regression when you have a continuous outcome, and logistic regression when you have a binary outcome. Linear regression is better when you want a specifically predicted number a person will have, and logistic regression is better than you want more of a predicted probability/odds. R-squared is the percentage of variability in the outcome that can be explained by the model. AUC determines how well a model can discriminate between a “case” and “non-case”. Both are based on the outcome variable and how well the model can help to predict the outcome, but since the outcome variables for models are on different scales, the criteria and measures of model selection are different and measure different things. I did not use any AI for this assignment.