Computer Homework 4: Model Selection and Introduction to Logistic Regression

Loading necessary packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
library(broom)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(gtsummary)
library(leaps)
library(pROC)

## Warning: package 'pROC' was built under R version 4.5.3

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.5.3

## ResourceSelection 0.3-6   2023-06-27

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

## Setting theme "Compact"
## Setting theme "Compact"

Part 0: Data Preparation

brfss <- read_xpt("C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/New folder/LLCP2023XPT/LLCP2023.XPT")
 
dim(brfss)

## [1] 433323    350

names(brfss)

##   [1] "_STATE"   "FMONTH"   "IDATE"    "IMONTH"   "IDAY"     "IYEAR"   
##   [7] "DISPCODE" "SEQNO"    "_PSU"     "CTELENM1" "PVTRESD1" "COLGHOUS"
##  [13] "STATERE1" "CELPHON1" "LADULT1"  "NUMADULT" "RESPSLC1" "LANDSEX2"
##  [19] "LNDSXBRT" "SAFETIME" "CTELNUM1" "CELLFON5" "CADULT1"  "CELLSEX2"
##  [25] "CELSXBRT" "PVTRESD3" "CCLGHOUS" "CSTATE1"  "LANDLINE" "HHADULT" 
##  [31] "SEXVAR"   "GENHLTH"  "PHYSHLTH" "MENTHLTH" "POORHLTH" "PRIMINS1"
##  [37] "PERSDOC3" "MEDCOST1" "CHECKUP1" "EXERANY2" "EXRACT12" "EXEROFT1"
##  [43] "EXERHMM1" "EXRACT22" "EXEROFT2" "EXERHMM2" "STRENGTH" "BPHIGH6" 
##  [49] "BPMEDS1"  "CHOLCHK3" "TOLDHI3"  "CHOLMED3" "CVDINFR4" "CVDCRHD4"
##  [55] "CVDSTRK3" "ASTHMA3"  "ASTHNOW"  "CHCSCNC1" "CHCOCNC1" "CHCCOPD3"
##  [61] "ADDEPEV3" "CHCKDNY2" "HAVARTH4" "DIABETE4" "DIABAGE4" "MARITAL" 
##  [67] "EDUCA"    "RENTHOM1" "NUMHHOL4" "NUMPHON4" "CPDEMO1C" "VETERAN3"
##  [73] "EMPLOY1"  "CHILDREN" "INCOME3"  "PREGNANT" "WEIGHT2"  "HEIGHT3" 
##  [79] "DEAF"     "BLIND"    "DECIDE"   "DIFFWALK" "DIFFDRES" "DIFFALON"
##  [85] "FALL12MN" "FALLINJ5" "SMOKE100" "SMOKDAY2" "USENOW3"  "ECIGNOW2"
##  [91] "ALCDAY4"  "AVEDRNK3" "DRNK3GE5" "MAXDRNKS" "FLUSHOT7" "FLSHTMY3"
##  [97] "PNEUVAC4" "SHINGLE2" "HIVTST7"  "HIVTSTD3" "SEATBELT" "DRNKDRI2"
## [103] "COVIDPO1" "COVIDSM1" "COVIDACT" "PDIABTS1" "PREDIAB2" "DIABTYPE"
## [109] "INSULIN1" "CHKHEMO3" "EYEEXAM1" "DIABEYE1" "DIABEDU1" "FEETSORE"
## [115] "ARTHEXER" "ARTHEDU"  "LMTJOIN3" "ARTHDIS2" "JOINPAI2" "LCSFIRST"
## [121] "LCSLAST"  "LCSNUMCG" "LCSCTSC1" "LCSSCNCR" "LCSCTWHN" "HADMAM"  
## [127] "HOWLONG"  "CERVSCRN" "CRVCLCNC" "CRVCLPAP" "CRVCLHPV" "HADHYST2"
## [133] "PSATEST1" "PSATIME1" "PCPSARS2" "PSASUGS1" "PCSTALK2" "HADSIGM4"
## [139] "COLNSIGM" "COLNTES1" "SIGMTES1" "LASTSIG4" "COLNCNCR" "VIRCOLO1"
## [145] "VCLNTES2" "SMALSTOL" "STOLTEST" "STOOLDN2" "BLDSTFIT" "SDNATES1"
## [151] "CNCRDIFF" "CNCRAGE"  "CNCRTYP2" "CSRVTRT3" "CSRVDOC1" "CSRVSUM" 
## [157] "CSRVRTRN" "CSRVINST" "CSRVINSR" "CSRVDEIN" "CSRVCLIN" "CSRVPAIN"
## [163] "CSRVCTL2" "INDORTAN" "NUMBURN3" "SUNPRTCT" "WKDAYOUT" "WKENDOUT"
## [169] "CIMEMLO1" "CDWORRY"  "CDDISCU1" "CDHOUS1"  "CDSOCIA1" "CAREGIV1"
## [175] "CRGVREL4" "CRGVLNG1" "CRGVHRS1" "CRGVPRB3" "CRGVALZD" "CRGVPER1"
## [181] "CRGVHOU1" "CRGVEXPT" "LASTSMK2" "STOPSMK2" "MENTCIGS" "MENTECIG"
## [187] "HEATTBCO" "FIREARM5" "GUNLOAD"  "LOADULK2" "HASYMP1"  "HASYMP2" 
## [193] "HASYMP3"  "HASYMP4"  "HASYMP5"  "HASYMP6"  "STRSYMP1" "STRSYMP2"
## [199] "STRSYMP3" "STRSYMP4" "STRSYMP5" "STRSYMP6" "FIRSTAID" "ASPIRIN" 
## [205] "BIRTHSEX" "SOMALE"   "SOFEMALE" "TRNSGNDR" "MARIJAN1" "MARJSMOK"
## [211] "MARJEAT"  "MARJVAPE" "MARJDAB"  "MARJOTHR" "USEMRJN4" "ACEDEPRS"
## [217] "ACEDRINK" "ACEDRUGS" "ACEPRISN" "ACEDIVRC" "ACEPUNCH" "ACEHURT1"
## [223] "ACESWEAR" "ACETOUCH" "ACETTHEM" "ACEHVSEX" "ACEADSAF" "ACEADNED"
## [229] "IMFVPLA4" "HPVADVC4" "HPVADSHT" "TETANUS1" "COVIDVA1" "COVACGE1"
## [235] "COVIDNU2" "LSATISFY" "EMTSUPRT" "SDLONELY" "SDHEMPLY" "FOODSTMP"
## [241] "SDHFOOD1" "SDHBILLS" "SDHUTILS" "SDHTRNSP" "SDHSTRE1" "RRCLASS3"
## [247] "RRCOGNT2" "RRTREAT"  "RRATWRK2" "RRHCARE4" "RRPHYSM2" "RCSGEND1"
## [253] "RCSXBRTH" "RCSRLTN2" "CASTHDX2" "CASTHNO2" "QSTVER"   "QSTLANG" 
## [259] "_METSTAT" "_URBSTAT" "MSCODE"   "_STSTR"   "_STRWT"   "_RAWRAKE"
## [265] "_WT2RAKE" "_IMPRACE" "_CHISPNC" "_CRACE1"  "CAGEG"    "_CLLCPWT"
## [271] "_DUALUSE" "_DUALCOR" "_LLCPWT2" "_LLCPWT"  "_RFHLTH"  "_PHYS14D"
## [277] "_MENT14D" "_HLTHPL1" "_HCVU653" "_TOTINDA" "METVL12_" "METVL22_"
## [283] "MAXVO21_" "FC601_"   "ACTIN13_" "ACTIN23_" "PADUR1_"  "PADUR2_" 
## [289] "PAFREQ1_" "PAFREQ2_" "_MINAC12" "_MINAC22" "STRFREQ_" "PAMISS3_"
## [295] "PAMIN13_" "PAMIN23_" "PA3MIN_"  "PAVIG13_" "PAVIG23_" "PA3VIGM_"
## [301] "_PACAT3"  "_PAINDX3" "_PA150R4" "_PA300R4" "_PA30023" "_PASTRNG"
## [307] "_PAREC3"  "_PASTAE3" "_RFHYPE6" "_CHOLCH3" "_RFCHOL3" "_MICHD"  
## [313] "_LTASTH1" "_CASTHM1" "_ASTHMS1" "_DRDXAR2" "_MRACE1"  "_HISPANC"
## [319] "_RACE"    "_RACEG21" "_RACEGR3" "_RACEPRV" "_SEX"     "_AGEG5YR"
## [325] "_AGE65YR" "_AGE80"   "_AGE_G"   "HTIN4"    "HTM4"     "WTKG3"   
## [331] "_BMI5"    "_BMI5CAT" "_RFBMI5"  "_CHLDCNT" "_EDUCAG"  "_INCOMG1"
## [337] "_SMOKER3" "_RFSMOK3" "_CURECI2" "DRNKANY6" "DROCDY4_" "_RFBING6"
## [343] "_DRNKWK2" "_RFDRHV8" "_FLSHOT7" "_PNEUMO3" "_AIDTST4" "_RFSEAT2"
## [349] "_RFSEAT3" "_DRNKDRV"

Recoding the new dataset

brfss_clean <- brfss |>
  select(
    PHYSHLTH, MENTHLTH, `_BMI5`, SEXVAR, EXERANY2,
    ADDEPEV3, `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`, GENHLTH
  ) |>
  mutate(
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      TRUE ~ as.numeric(PHYSHLTH)
    ),
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      TRUE ~ as.numeric(MENTHLTH)
    ),
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ `_BMI5` / 100
    ),
    sex = factor(SEXVAR, levels = c(1, 2), labels = c("Male", "Female")),
    exercise = factor(EXERANY2, levels = c(2, 1), labels = c("No", "Yes")),
    depression = factor(ADDEPEV3, levels = c(2, 1), labels = c("No", "Yes")),
    age_group = factor(case_when(
      `_AGEG5YR` %in% 1:3 ~ "18-34",
      `_AGEG5YR` %in% 4:6 ~ "35-49",
      `_AGEG5YR` %in% 7:9 ~ "50-64",
      `_AGEG5YR` %in% 10:13 ~ "65+",
      TRUE ~ NA_character_
    )),
    income = factor(case_when(
      `_INCOMG1` %in% 1:2 ~ "Less than $25K",
      `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
      `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
      `_INCOMG1` == 7 ~ "$100K+",
      TRUE ~ NA_character_
    )),
    education = factor(case_when(
      EDUCA %in% 1:3 ~ "Less than HS",
      EDUCA == 4 ~ "HS/GED",
      EDUCA == 5 ~ "Some college",
      EDUCA == 6 ~ "College graduate",
      TRUE ~ NA_character_
    )),
    smoking = factor(case_when(
      `_SMOKER3` %in% 1:2 ~ "Current",
      `_SMOKER3` == 3 ~ "Former",
      `_SMOKER3` == 4 ~ "Never",
      TRUE ~ NA_character_
    )),
    gen_health = factor(case_when(
      GENHLTH %in% 1:2 ~ "Excellent/Very Good",
      GENHLTH == 3 ~ "Good",
      GENHLTH %in% 4:5 ~ "Fair/Poor",
      TRUE ~ NA_character_
    ))
  ) |>
  select(
    physhlth_days, menthlth_days, bmi, sex, exercise,
    depression, age_group, income, education, smoking, gen_health
  )

Setting Reference Level

brfss_clean <- brfss_clean |>
  mutate(
    sex = relevel(sex, ref = "Male"),
    exercise = relevel(exercise, ref = "No"),
    depression = relevel(depression, ref = "No"),
    age_group = relevel(age_group, ref = "18-34"),
    income = relevel(income, ref = "Less than $25K"),
    education = relevel(education, ref = "Less than HS"),
    smoking = relevel(smoking, ref = "Never"),
    gen_health = relevel(gen_health, ref = "Excellent/Very Good")
  )

Missing data handing

# Missing summary
summary(is.na(brfss_clean$physhlth_days))

##    Mode   FALSE    TRUE 
## logical  422538   10785

# Final dataset
set.seed(1220)

brfss_analytic <- brfss_clean |>
  drop_na() |>
  slice_sample(n = 8000)

nrow(brfss_analytic)

## [1] 8000

tbl_summary(brfss_analytic)

Characteristic	N = 8,000¹
physhlth_days	0 (0, 4)
menthlth_days	0 (0, 5)
bmi	27 (24, 32)
sex
Male	3,905 (49%)
Female	4,095 (51%)
exercise	6,224 (78%)
depression	1,694 (21%)
age_group
18-34	1,328 (17%)
35-49	1,650 (21%)
50-64	2,145 (27%)
65+	2,877 (36%)
income
Less than $25K	1,101 (14%)
$100K+	631 (7.9%)
$25K-$49K	1,927 (24%)
$50K-$99K	4,341 (54%)
education
Less than HS	384 (4.8%)
College graduate	3,608 (45%)
HS/GED	1,941 (24%)
Some college	2,067 (26%)
smoking
Never	4,884 (61%)
Current	855 (11%)
Former	2,261 (28%)
gen_health
Excellent/Very Good	3,990 (50%)
Fair/Poor	1,387 (17%)
Good	2,623 (33%)
¹ Median (Q1, Q3); n (%)

Part 1: Model Selection

# Step 1: Maximum model
mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health,
  data = brfss_analytic
)

tidy(mod_max, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Maximum Model: All Candidate Predictors",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Maximum Model: All Candidate Predictors
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.3194	0.5781	2.2825	0.0225	0.1863	2.4526
menthlth_days	0.1991	0.0109	18.2713	0.0000	0.1777	0.2204
bmi	-0.0010	0.0123	-0.0811	0.9354	-0.0250	0.0230
sexFemale	0.2382	0.1583	1.5049	0.1324	-0.0721	0.5484
exerciseYes	-2.1206	0.1996	-10.6241	0.0000	-2.5118	-1.7293
depressionYes	0.3477	0.2126	1.6351	0.1021	-0.0691	0.7645
age_group35-49	0.3536	0.2601	1.3591	0.1741	-0.1564	0.8635
age_group50-64	1.5652	0.2484	6.3017	0.0000	1.0783	2.0521
age_group65+	1.6307	0.2425	6.7239	0.0000	1.1553	2.1062
income$100K+	-0.9897	0.3826	-2.5870	0.0097	-1.7397	-0.2398
income$25K-$49K	-0.4766	0.2670	-1.7850	0.0743	-1.0000	0.0468
income$50K-$99K	-1.1806	0.2594	-4.5508	0.0000	-1.6891	-0.6720
educationCollege graduate	1.3387	0.4028	3.3236	0.0009	0.5491	2.1282
educationHS/GED	0.8888	0.3931	2.2609	0.0238	0.1182	1.6594
educationSome college	1.1012	0.3978	2.7682	0.0056	0.3214	1.8811
smokingCurrent	0.3991	0.2697	1.4794	0.1391	-0.1297	0.9278
smokingFormer	0.2341	0.1818	1.2874	0.1980	-0.1223	0.5905
gen_healthFair/Poor	10.0607	0.2495	40.3167	0.0000	9.5715	10.5499
gen_healthGood	1.1876	0.1810	6.5628	0.0000	0.8328	1.5423

# Model fit statistics

glance(mod_max) |>
  dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
  mutate(across(everything(), \(x) round(x, 3))) |>
  kable(caption = "Maximum Model: Fit Statistics") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Maximum Model: Fit Statistics
r.squared	adj.r.squared	sigma	AIC	BIC	df.residual
0.347	0.345	6.891	53606.97	53746.71	7981

# Step 2: Best subsets regression

best_subsets <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health,
  data = brfss_analytic,
  nvmax = ncol(model.matrix(
    physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
      age_group + income + education + smoking + gen_health,
    data = brfss_analytic
  )) - 1,
  method = "exhaustive"
)

best_summary <- summary(best_subsets)

subset_metrics <- tibble(
  p = 1:length(best_summary$adjr2),
  `Adj. R²` = best_summary$adjr2,
  BIC = best_summary$bic,
  Cp = best_summary$cp
)

# Adjusted R2 plot

p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.max(best_summary$adjr2),
             linetype = "dashed", color = "tomato") +
  labs(
    title = "Adjusted R² by Model Size",
    x = "Number of Predictors",
    y = "Adjusted R²"
  ) +
  theme_minimal(base_size = 12)

# BIC Plot

p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
  geom_line(linewidth = 1, color = "steelblue") +
  geom_point(size = 3, color = "steelblue") +
  geom_vline(xintercept = which.min(best_summary$bic),
             linetype = "dashed", color = "tomato") +
  labs(
    title = "BIC by Model Size",
    x = "Number of Predictors",
    y = "BIC"
  ) +
  theme_minimal(base_size = 12)

# Both plots side-by-side

gridExtra::grid.arrange(p1, p2, ncol = 2)

which.max(best_summary$adjr2)

## [1] 17

which.min(best_summary$bic)

## [1] 7

coef(best_subsets, which.max(best_summary$adjr2))

##               (Intercept)             menthlth_days                 sexFemale 
##                 1.2922232                 0.1990913                 0.2384094 
##               exerciseYes             depressionYes            age_group35-49 
##                -2.1184627                 0.3464440                 0.3520047 
##            age_group50-64              age_group65+              income$100K+ 
##                 1.5637608                 1.6310257                -0.9895508 
##           income$25K-$49K           income$50K-$99K educationCollege graduate 
##                -0.4772550                -1.1814628                 1.3387368 
##           educationHS/GED     educationSome college            smokingCurrent 
##                 0.8880462                 1.1002997                 0.4010414 
##             smokingFormer       gen_healthFair/Poor            gen_healthGood 
##                 0.2341107                10.0574585                 1.1851928

coef(best_subsets, which.min(best_summary$bic))

##         (Intercept)       menthlth_days         exerciseYes      age_group50-64 
##           2.2684693           0.2104121          -2.0985245           1.4139167 
##        age_group65+     income$50K-$99K gen_healthFair/Poor      gen_healthGood 
##           1.5365529          -0.6232997          10.1107030           1.2085543

cat("Best model by Adj. R²:", which.max(best_summary$adjr2), "variables\n")

## Best model by Adj. R²: 17 variables

cat("Best model by BIC:", which.min(best_summary$bic), "variables\n")

## Best model by BIC: 7 variables

best_bic_idx <- which.min(best_summary$bic)
best_vars <- names(which(best_summary$which[best_bic_idx, -1]))
cat("\nVariables in BIC-best model:\n")

## 
## Variables in BIC-best model:

cat(paste(" ", best_vars), sep = "\n")

##   menthlth_days
##   exerciseYes
##   age_group50-64
##   age_group65+
##   income$50K-$99K
##   gen_healthFair/Poor
##   gen_healthGood

# Step 3: Stepwise selection methods

# Backward elimination

mod_back <- mod_max
cat("Step 1: Maximum model\n")

## Step 1: Maximum model

cat("Variables:", paste(names(coef(mod_back))[-1], collapse = ", "), "\n")

## Variables: menthlth_days, bmi, sexFemale, exerciseYes, depressionYes, age_group35-49, age_group50-64, age_group65+, income$100K+, income$25K-$49K, income$50K-$99K, educationCollege graduate, educationHS/GED, educationSome college, smokingCurrent, smokingFormer, gen_healthFair/Poor, gen_healthGood

pvals <- tidy(mod_back) |>
  filter(term != "(Intercept)") |>
  arrange(desc(p.value)) |>
  dplyr::select(term, estimate, p.value) |>
  mutate(across(where(is.numeric), \(x) round(x, 4)))

pvals |>
  head(5) |>
  kable(caption = "Maximum Model: Variables Sorted by p-value (Highest First)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Maximum Model: Variables Sorted by p-value (Highest First)
term	estimate	p.value
bmi	-0.0010	0.9354
smokingFormer	0.2341	0.1980
age_group35-49	0.3536	0.1741
smokingCurrent	0.3991	0.1391
sexFemale	0.2382	0.1324

mod_backward <- step(mod_max, direction = "backward", trace = 1)

## Start:  AIC=30901.95
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - bmi            1         0 378962 30900
## - smoking        2       147 379109 30901
## <none>                       378962 30902
## - sex            1       108 379069 30902
## - depression     1       127 379089 30903
## - education      3       585 379547 30908
## - income         3      1184 380145 30921
## - age_group      3      3448 382410 30968
## - exercise       1      5360 384321 31012
## - menthlth_days  1     15852 394814 31228
## - gen_health     2     82226 461188 32469
## 
## Step:  AIC=30899.96
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - smoking        2       149 379111 30899
## <none>                       378962 30900
## - sex            1       108 379070 30900
## - depression     1       127 379089 30901
## - education      3       586 379548 30906
## - income         3      1187 380149 30919
## - age_group      3      3451 382413 30966
## - exercise       1      5440 384403 31012
## - menthlth_days  1     15856 394818 31226
## - gen_health     2     83608 462570 32491
## 
## Step:  AIC=30899.1
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## - sex            1        86 379197 30899
## <none>                       379111 30899
## - depression     1       143 379254 30900
## - education      3       518 379629 30904
## - income         3      1226 380337 30919
## - age_group      3      3677 382788 30970
## - exercise       1      5462 384573 31012
## - menthlth_days  1     16186 395297 31232
## - gen_health     2     84040 463151 32497
## 
## Step:  AIC=30898.91
## physhlth_days ~ menthlth_days + exercise + depression + age_group + 
##     income + education + gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       379197 30899
## - depression     1       174 379371 30901
## - education      3       551 379748 30905
## - income         3      1299 380496 30920
## - age_group      3      3726 382923 30971
## - exercise       1      5512 384709 31012
## - menthlth_days  1     16208 395405 31232
## - gen_health     2     83954 463151 32495

tidy(mod_backward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Backward Elimination Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Backward Elimination Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.5069	0.4596	3.2785	0.0010	0.6059	2.4079
menthlth_days	0.2006	0.0109	18.4742	0.0000	0.1793	0.2219
exerciseYes	-2.1309	0.1978	-10.7739	0.0000	-2.5186	-1.7432
depressionYes	0.4018	0.2101	1.9122	0.0559	-0.0101	0.8137
age_group35-49	0.4210	0.2573	1.6361	0.1019	-0.0834	0.9253
age_group50-64	1.6359	0.2454	6.6667	0.0000	1.1549	2.1170
age_group65+	1.7003	0.2381	7.1401	0.0000	1.2335	2.1671
income$100K+	-1.0655	0.3800	-2.8037	0.0051	-1.8104	-0.3205
income$25K-$49K	-0.4877	0.2666	-1.8292	0.0674	-1.0103	0.0350
income$50K-$99K	-1.2234	0.2576	-4.7495	0.0000	-1.7283	-0.7185
educationCollege graduate	1.2925	0.3998	3.2330	0.0012	0.5088	2.0762
educationHS/GED	0.8799	0.3926	2.2414	0.0250	0.1104	1.6494
educationSome college	1.0942	0.3966	2.7590	0.0058	0.3168	1.8717
gen_healthFair/Poor	10.0658	0.2458	40.9539	0.0000	9.5840	10.5476
gen_healthGood	1.1949	0.1782	6.7058	0.0000	0.8456	1.5442

# Automated forward selection using AIC
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

mod_forward <- step(mod_null,
                    scope = list(lower = mod_null, upper = mod_max),
                    direction = "forward", trace = 1)

## Start:  AIC=34269.45
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    168850 411060 31520
## + menthlth_days  1     66457 513453 33298
## + exercise       1     38219 541691 33726
## + income         3     30450 549460 33844
## + depression     1     21930 557980 33963
## + education      3     10761 569149 34126
## + smoking        2      9915 569995 34135
## + bmi            1      7925 571985 34161
## + age_group      3      7295 572614 34174
## + sex            1      1400 578510 34252
## <none>                       579910 34269
## 
## Step:  AIC=31520.38
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1   19271.4 391789 31138
## + exercise       1    7169.1 403891 31382
## + depression     1    4595.4 406465 31432
## + income         3    3091.4 407969 31466
## + age_group      3    1764.5 409295 31492
## + smoking        2    1068.3 409992 31504
## + sex            1     875.5 410185 31505
## <none>                       411060 31520
## + bmi            1      89.0 410971 31521
## + education      3     206.8 410853 31522
## 
## Step:  AIC=31138.25
## physhlth_days ~ gen_health + menthlth_days
## 
##              Df Sum of Sq    RSS   AIC
## + exercise    1    6765.4 385023 31001
## + age_group   3    5018.7 386770 31041
## + income      3    1818.7 389970 31107
## + sex         1     445.3 391343 31131
## + smoking     2     446.8 391342 31133
## + depression  1     228.1 391560 31136
## <none>                    391789 31138
## + bmi         1      48.3 391740 31139
## + education   3     137.1 391651 31141
## 
## Step:  AIC=31000.9
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##              Df Sum of Sq    RSS   AIC
## + age_group   3    4094.6 380929 30921
## + income      3    1057.5 383966 30985
## + sex         1     318.7 384704 30996
## + depression  1     219.6 384804 30998
## + smoking     2     296.4 384727 30999
## + education   3     346.2 384677 31000
## <none>                    385023 31001
## + bmi         1      12.0 385011 31003
## 
## Step:  AIC=30921.36
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##              Df Sum of Sq    RSS   AIC
## + income      3    954.84 379974 30907
## + depression  1    259.79 380669 30918
## + sex         1    224.36 380704 30919
## <none>                    380929 30921
## + smoking     2    127.87 380801 30923
## + education   3    197.55 380731 30923
## + bmi         1      3.02 380926 30923
## 
## Step:  AIC=30907.29
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##              Df Sum of Sq    RSS   AIC
## + education   3    603.13 379371 30901
## + depression  1    226.05 379748 30905
## + sex         1    160.83 379813 30906
## <none>                    379974 30907
## + bmi         1      0.31 379973 30909
## + smoking     2     69.38 379904 30910
## 
## Step:  AIC=30900.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + education
## 
##              Df Sum of Sq    RSS   AIC
## + depression  1   173.649 379197 30899
## + sex         1   116.600 379254 30900
## <none>                    379371 30901
## + smoking     2   140.198 379230 30902
## + bmi         1     0.347 379370 30903
## 
## Step:  AIC=30898.91
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + education + depression
## 
##           Df Sum of Sq    RSS   AIC
## <none>                 379197 30899
## + sex      1    86.111 379111 30899
## + smoking  2   126.998 379070 30900
## + bmi      1     2.214 379195 30901

tidy(mod_forward, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Forward Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Forward Selection Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.5069	0.4596	3.2785	0.0010	0.6059	2.4079
gen_healthFair/Poor	10.0658	0.2458	40.9539	0.0000	9.5840	10.5476
gen_healthGood	1.1949	0.1782	6.7058	0.0000	0.8456	1.5442
menthlth_days	0.2006	0.0109	18.4742	0.0000	0.1793	0.2219
exerciseYes	-2.1309	0.1978	-10.7739	0.0000	-2.5186	-1.7432
age_group35-49	0.4210	0.2573	1.6361	0.1019	-0.0834	0.9253
age_group50-64	1.6359	0.2454	6.6667	0.0000	1.1549	2.1170
age_group65+	1.7003	0.2381	7.1401	0.0000	1.2335	2.1671
income$100K+	-1.0655	0.3800	-2.8037	0.0051	-1.8104	-0.3205
income$25K-$49K	-0.4877	0.2666	-1.8292	0.0674	-1.0103	0.0350
income$50K-$99K	-1.2234	0.2576	-4.7495	0.0000	-1.7283	-0.7185
educationCollege graduate	1.2925	0.3998	3.2330	0.0012	0.5088	2.0762
educationHS/GED	0.8799	0.3926	2.2414	0.0250	0.1104	1.6494
educationSome college	1.0942	0.3966	2.7590	0.0058	0.3168	1.8717
depressionYes	0.4018	0.2101	1.9122	0.0559	-0.0101	0.8137

# Stepwise selection (both)
mod_stepwise <- step(mod_null,
                     scope = list(lower = mod_null, upper = mod_max),
                     direction = "both", trace = 1)

## Start:  AIC=34269.45
## physhlth_days ~ 1
## 
##                 Df Sum of Sq    RSS   AIC
## + gen_health     2    168850 411060 31520
## + menthlth_days  1     66457 513453 33298
## + exercise       1     38219 541691 33726
## + income         3     30450 549460 33844
## + depression     1     21930 557980 33963
## + education      3     10761 569149 34126
## + smoking        2      9915 569995 34135
## + bmi            1      7925 571985 34161
## + age_group      3      7295 572614 34174
## + sex            1      1400 578510 34252
## <none>                       579910 34269
## 
## Step:  AIC=31520.38
## physhlth_days ~ gen_health
## 
##                 Df Sum of Sq    RSS   AIC
## + menthlth_days  1     19271 391789 31138
## + exercise       1      7169 403891 31382
## + depression     1      4595 406465 31432
## + income         3      3091 407969 31466
## + age_group      3      1765 409295 31492
## + smoking        2      1068 409992 31504
## + sex            1       875 410185 31505
## <none>                       411060 31520
## + bmi            1        89 410971 31521
## + education      3       207 410853 31522
## - gen_health     2    168850 579910 34269
## 
## Step:  AIC=31138.25
## physhlth_days ~ gen_health + menthlth_days
## 
##                 Df Sum of Sq    RSS   AIC
## + exercise       1      6765 385023 31001
## + age_group      3      5019 386770 31041
## + income         3      1819 389970 31107
## + sex            1       445 391343 31131
## + smoking        2       447 391342 31133
## + depression     1       228 391560 31136
## <none>                       391789 31138
## + bmi            1        48 391740 31139
## + education      3       137 391651 31141
## - menthlth_days  1     19271 411060 31520
## - gen_health     2    121665 513453 33298
## 
## Step:  AIC=31000.9
## physhlth_days ~ gen_health + menthlth_days + exercise
## 
##                 Df Sum of Sq    RSS   AIC
## + age_group      3      4095 380929 30921
## + income         3      1058 383966 30985
## + sex            1       319 384704 30996
## + depression     1       220 384804 30998
## + smoking        2       296 384727 30999
## + education      3       346 384677 31000
## <none>                       385023 31001
## + bmi            1        12 385011 31003
## - exercise       1      6765 391789 31138
## - menthlth_days  1     18868 403891 31382
## - gen_health     2     99662 484685 32838
## 
## Step:  AIC=30921.36
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
## 
##                 Df Sum of Sq    RSS   AIC
## + income         3       955 379974 30907
## + depression     1       260 380669 30918
## + sex            1       224 380704 30919
## <none>                       380929 30921
## + smoking        2       128 380801 30923
## + education      3       198 380731 30923
## + bmi            1         3 380926 30923
## - age_group      3      4095 385023 31001
## - exercise       1      5841 386770 31041
## - menthlth_days  1     21621 402550 31361
## - gen_health     2     92101 473029 32650
## 
## Step:  AIC=30907.29
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income
## 
##                 Df Sum of Sq    RSS   AIC
## + education      3       603 379371 30901
## + depression     1       226 379748 30905
## + sex            1       161 379813 30906
## <none>                       379974 30907
## + bmi            1         0 379973 30909
## + smoking        2        69 379904 30910
## - income         3       955 380929 30921
## - age_group      3      3992 383966 30985
## - exercise       1      5200 385174 31014
## - menthlth_days  1     20471 400444 31325
## - gen_health     2     84470 464444 32509
## 
## Step:  AIC=30900.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + education
## 
##                 Df Sum of Sq    RSS   AIC
## + depression     1       174 379197 30899
## + sex            1       117 379254 30900
## <none>                       379371 30901
## + smoking        2       140 379230 30902
## + bmi            1         0 379370 30903
## - education      3       603 379974 30907
## - income         3      1360 380731 30923
## - age_group      3      3689 383059 30972
## - exercise       1      5531 384902 31014
## - menthlth_days  1     20150 399521 31313
## - gen_health     2     84954 464325 32513
## 
## Step:  AIC=30898.91
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + education + depression
## 
##                 Df Sum of Sq    RSS   AIC
## <none>                       379197 30899
## + sex            1        86 379111 30899
## + smoking        2       127 379070 30900
## - depression     1       174 379371 30901
## + bmi            1         2 379195 30901
## - education      3       551 379748 30905
## - income         3      1299 380496 30920
## - age_group      3      3726 382923 30971
## - exercise       1      5512 384709 31012
## - menthlth_days  1     16208 395405 31232
## - gen_health     2     83954 463151 32495

tidy(mod_stepwise, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stepwise Selection Result (AIC-based)",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Stepwise Selection Result (AIC-based)
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	1.5069	0.4596	3.2785	0.0010	0.6059	2.4079
gen_healthFair/Poor	10.0658	0.2458	40.9539	0.0000	9.5840	10.5476
gen_healthGood	1.1949	0.1782	6.7058	0.0000	0.8456	1.5442
menthlth_days	0.2006	0.0109	18.4742	0.0000	0.1793	0.2219
exerciseYes	-2.1309	0.1978	-10.7739	0.0000	-2.5186	-1.7432
age_group35-49	0.4210	0.2573	1.6361	0.1019	-0.0834	0.9253
age_group50-64	1.6359	0.2454	6.6667	0.0000	1.1549	2.1170
age_group65+	1.7003	0.2381	7.1401	0.0000	1.2335	2.1671
income$100K+	-1.0655	0.3800	-2.8037	0.0051	-1.8104	-0.3205
income$25K-$49K	-0.4877	0.2666	-1.8292	0.0674	-1.0103	0.0350
income$50K-$99K	-1.2234	0.2576	-4.7495	0.0000	-1.7283	-0.7185
educationCollege graduate	1.2925	0.3998	3.2330	0.0012	0.5088	2.0762
educationHS/GED	0.8799	0.3926	2.2414	0.0250	0.1104	1.6494
educationSome college	1.0942	0.3966	2.7590	0.0058	0.3168	1.8717
depressionYes	0.4018	0.2101	1.9122	0.0559	-0.0101	0.8137

# Variables retained in each model

attr(terms(mod_backward), "term.labels")

## [1] "menthlth_days" "exercise"      "depression"    "age_group"    
## [5] "income"        "education"     "gen_health"

attr(terms(mod_forward), "term.labels")

## [1] "gen_health"    "menthlth_days" "exercise"      "age_group"    
## [5] "income"        "education"     "depression"

attr(terms(mod_stepwise), "term.labels")

## [1] "gen_health"    "menthlth_days" "exercise"      "age_group"    
## [5] "income"        "education"     "depression"

# Step 4: Choose final model
# Compare all methods in one table
method_comparison <- tribble(
  ~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
  "Backward (AIC)",
    length(coef(mod_backward)) - 1,
    round(glance(mod_backward)$adj.r.squared, 4),
    round(AIC(mod_backward), 1),
    round(BIC(mod_backward), 1),
  "Forward (AIC)",
    length(coef(mod_forward)) - 1,
    round(glance(mod_forward)$adj.r.squared, 4),
    round(AIC(mod_forward), 1),
    round(BIC(mod_forward), 1),
  "Stepwise (AIC)",
    length(coef(mod_stepwise)) - 1,
    round(glance(mod_stepwise)$adj.r.squared, 4),
    round(AIC(mod_stepwise), 1),
    round(BIC(mod_stepwise), 1)
)

method_comparison |>
  kable(caption = "Comparison of Automated Variable Selection Methods") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Comparison of Automated Variable Selection Methods
Method	Variables selected	Adj. R²	AIC	BIC
Backward (AIC)	14	0.345	53603.9	53715.7
Forward (AIC)	14	0.345	53603.9	53715.7
Stepwise (AIC)	14	0.345	53603.9	53715.7

# As backward, forward, and stepwise all selected the same model,

mod_final <- mod_stepwise

# Final model summary
summary(mod_final)

## 
## Call:
## lm(formula = physhlth_days ~ gen_health + menthlth_days + exercise + 
##     age_group + income + education + depression, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.9865  -2.6810  -1.1454   0.6303  30.2318 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.50689    0.45962   3.279  0.00105 ** 
## gen_healthFair/Poor       10.06584    0.24578  40.954  < 2e-16 ***
## gen_healthGood             1.19494    0.17819   6.706 2.14e-11 ***
## menthlth_days              0.20064    0.01086  18.474  < 2e-16 ***
## exerciseYes               -2.13087    0.19778 -10.774  < 2e-16 ***
## age_group35-49             0.42096    0.25730   1.636  0.10187    
## age_group50-64             1.63593    0.24539   6.667 2.79e-11 ***
## age_group65+               1.70027    0.23813   7.140 1.01e-12 ***
## income$100K+              -1.06549    0.38003  -2.804  0.00506 ** 
## income$25K-$49K           -0.48766    0.26660  -1.829  0.06741 .  
## income$50K-$99K           -1.22340    0.25759  -4.749 2.08e-06 ***
## educationCollege graduate  1.29249    0.39978   3.233  0.00123 ** 
## educationHS/GED            0.87988    0.39256   2.241  0.02503 *  
## educationSome college      1.09424    0.39661   2.759  0.00581 ** 
## depressionYes              0.40182    0.21013   1.912  0.05588 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.891 on 7985 degrees of freedom
## Multiple R-squared:  0.3461, Adjusted R-squared:  0.345 
## F-statistic: 301.9 on 14 and 7985 DF,  p-value: < 2.2e-16

# Coefficient table with 95% confidence intervals
final_coef_table <- tidy(mod_final, conf.int = TRUE)

final_coef_table |>
  kable(digits = 3, caption = "Coefficient Table for the Final Linear Regression Model") |>
  kable_styling(full_width = FALSE)

Coefficient Table for the Final Linear Regression Model
term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	1.507	0.460	3.279	0.001	0.606	2.408
gen_healthFair/Poor	10.066	0.246	40.954	0.000	9.584	10.548
gen_healthGood	1.195	0.178	6.706	0.000	0.846	1.544
menthlth_days	0.201	0.011	18.474	0.000	0.179	0.222
exerciseYes	-2.131	0.198	-10.774	0.000	-2.519	-1.743
age_group35-49	0.421	0.257	1.636	0.102	-0.083	0.925
age_group50-64	1.636	0.245	6.667	0.000	1.155	2.117
age_group65+	1.700	0.238	7.140	0.000	1.233	2.167
income$100K+	-1.065	0.380	-2.804	0.005	-1.810	-0.321
income$25K-$49K	-0.488	0.267	-1.829	0.067	-1.010	0.035
income$50K-$99K	-1.223	0.258	-4.749	0.000	-1.728	-0.718
educationCollege graduate	1.292	0.400	3.233	0.001	0.509	2.076
educationHS/GED	0.880	0.393	2.241	0.025	0.110	1.649
educationSome college	1.094	0.397	2.759	0.006	0.317	1.872
depressionYes	0.402	0.210	1.912	0.056	-0.010	0.814

# Final model fit statistics
tibble(
  Model = "Final Model",
  R_squared = summary(mod_final)$r.squared,
  Adjusted_R_squared = summary(mod_final)$adj.r.squared,
  AIC = AIC(mod_final),
  BIC = BIC(mod_final)
) |>
  kable(digits = 3, caption = "Fit Statistics for the Final Linear Regression Model") |>
  kable_styling(full_width = FALSE)

Fit Statistics for the Final Linear Regression Model
Model	R_squared	Adjusted_R_squared	AIC	BIC
Final Model	0.346	0.345	53603.93	53715.73

# Show retained variables in final model
attr(terms(mod_final), "term.labels")

## [1] "gen_health"    "menthlth_days" "exercise"      "age_group"    
## [5] "income"        "education"     "depression"

tidy(mod_final, conf.int = TRUE)

## # A tibble: 15 × 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)             1.51     0.460       3.28 1.05e- 3   0.606     2.41  
##  2 gen_healthFair/Poor    10.1      0.246      41.0  0          9.58     10.5   
##  3 gen_healthGood          1.19     0.178       6.71 2.14e-11   0.846     1.54  
##  4 menthlth_days           0.201    0.0109     18.5  1.18e-74   0.179     0.222 
##  5 exerciseYes            -2.13     0.198     -10.8  6.99e-27  -2.52     -1.74  
##  6 age_group35-49          0.421    0.257       1.64 1.02e- 1  -0.0834    0.925 
##  7 age_group50-64          1.64     0.245       6.67 2.79e-11   1.15      2.12  
##  8 age_group65+            1.70     0.238       7.14 1.01e-12   1.23      2.17  
##  9 income$100K+           -1.07     0.380      -2.80 5.06e- 3  -1.81     -0.321 
## 10 income$25K-$49K        -0.488    0.267      -1.83 6.74e- 2  -1.01      0.0350
## 11 income$50K-$99K        -1.22     0.258      -4.75 2.08e- 6  -1.73     -0.718 
## 12 educationCollege gr…    1.29     0.400       3.23 1.23e- 3   0.509     2.08  
## 13 educationHS/GED         0.880    0.393       2.24 2.50e- 2   0.110     1.65  
## 14 educationSome colle…    1.09     0.397       2.76 5.81e- 3   0.317     1.87  
## 15 depressionYes           0.402    0.210       1.91 5.59e- 2  -0.0101    0.814

# Step 5: LINE assumption check
par(mfrow = c(2, 2))
plot(mod_final)

par(mfrow = c(1, 1))

Part 1, Step 1: Maximum Model Interpretation

The maximum linear regression model explained a moderate amount of variation in physically unhealthy days, with an R² of 0.347 and an adjusted R² of 0.345, meaning about 34.5% of the variability in physical health burden was explained by the predictors in the model. The model AIC was 53,606.97 and BIC was 53,746.71. Several predictors were strongly associated with more unhealthy days, especially fair/poor general health (β = 10.06, 95% CI: 9.57, 10.55, p<0.001), good general health (β = 1.19, 95% CI: 0.83, 1.54, p<0.001), and mental unhealthy days (β = 0.20, 95% CI: 0.18, 0.22, p<0.001). Exercise was protective, with those reporting exercise having 2.12 fewer unhealthy days on average than non-exercisers (β = -2.12, p<0.001).

Part 1, Step 2: Best Subsets Interpretation

The best subsets regression showed that the model maximizing adjusted R² included 17 parameters, whereas the model minimizing BIC included only 7 parameters. These criteria did not agree. Adjusted R² favored a larger, more complex model, while BIC favored a simpler model. This is expected because BIC penalizes model complexity more strongly. The BIC-selected model retained mental unhealthy days, exercise, age 50–64, age 65+, income $50K–$99K, and general health indicators, suggesting these were the most important predictors.

Part 1, Step 3: Stepwise Methods Comparison

Backward elimination, forward selection, and stepwise selection all resulted in the same final model with 14 variables retained. All three methods also had identical adjusted R² (0.345), AIC (53,603.9), and BIC (53,715.7), showing strong agreement and model stability. Variables excluded by all methods were BMI, sex, and smoking, while predictors such as mental unhealthy days, exercise, age group, income, education, and general health were retained across all methods. Because all methods agreed, there was strong evidence that the selected predictors were robust.

Part 1, Step 4: Final Model Interpretation

Because all three automated selection methods produced the same final model, I selected this model as the final model. Holding all other variables constant, each additional mentally unhealthy day was associated with 0.20 more physically unhealthy days (β = 0.2006, p<0.001). Adults who exercised had 2.13 fewer unhealthy days than those who did not exercise (β = -2.1309, p<0.001). Compared with adults reporting excellent or very good health, those reporting fair/poor health had 10.07 more unhealthy days on average (β = 10.07, p<0.001), indicating general health was the strongest predictor.

Part 1, Step 5: LINE Assumptions

The Residuals vs Fitted plot suggested the linearity assumption was reasonably satisfied, with only mild evidence of non-linearity. The Normal Q-Q plot showed residuals were approximately normal, although some deviation was observed in the tails. The Scale-Location plot suggested roughly constant variance, with only minor heteroscedasticity. The Residuals vs Leverage plot did not suggest major influential observations. Overall, the LINE assumptions appeared reasonably satisfied, with only minor deviations that were not severe enough to invalidate the model.

Part 2: Logistic Regression

# Step 1 Create binary outcome

brfss_analytic <- brfss_analytic |>
  mutate(
    frequent_distress = factor(
      ifelse(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")
    )
  )

# Prevalence: counts
table(brfss_analytic$frequent_distress)

## 
##   No  Yes 
## 6927 1073

# Prevalence: percentages
prop.table(table(brfss_analytic$frequent_distress)) * 100

## 
##      No     Yes 
## 86.5875 13.4125

# Formatted prevalence table
distress_prev <- brfss_analytic |>
  count(frequent_distress) |>
  mutate(
    Percent = round(100 * n / sum(n), 1)
  )

distress_prev |>
  kable(
    caption = "Prevalence of Frequent Physical Distress",
    col.names = c("Frequent Distress", "Count", "Percent")
  ) |>
  kable_styling(full_width = FALSE)

Prevalence of Frequent Physical Distress
Frequent Distress	Count	Percent
No	6927	86.6
Yes	1073	13.4

# Step 2: Simple (Unadjusted) Logistic Regression

# Unadjusted model using general health
mod_simple <- glm(
  frequent_distress ~ gen_health,
  data = brfss_analytic,
  family = binomial
)

# Model summary
summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.34421    0.08725 -38.329   <2e-16 ***
## gen_healthFair/Poor  3.33700    0.10245  32.571   <2e-16 ***
## gen_healthGood       1.07595    0.10999   9.782   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6306.5  on 7999  degrees of freedom
## Residual deviance: 4741.8  on 7997  degrees of freedom
## AIC: 4747.8
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios
exp(coef(mod_simple))

##         (Intercept) gen_healthFair/Poor      gen_healthGood 
##          0.03528801         28.13465602          2.93277488

# 95% CI for ORs
exp(confint(mod_simple))

## Waiting for profiling to be done...

##                           2.5 %      97.5 %
## (Intercept)          0.02960235  0.04168464
## gen_healthFair/Poor 23.08938016 34.50941676
## gen_healthGood       2.36832877  3.64628926

# Formatted OR table
simple_or <- tidy(
  mod_simple,
  exponentiate = TRUE,
  conf.int = TRUE
)

simple_or |>
  kable(
    digits = 3,
    caption = "Unadjusted Odds Ratios for General Health and Frequent Physical Distress"
  ) |>
  kable_styling(full_width = FALSE)

Unadjusted Odds Ratios for General Health and Frequent Physical Distress
term	estimate	std.error	statistic	conf.low	conf.high
(Intercept)	0.035	0.087	-38.329	0.030	0.042
gen_healthFair/Poor	28.135	0.102	32.571	23.089	34.509
gen_healthGood	2.933	0.110	9.782	2.368	3.646

#Publication-Ready Table

mod_simple |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(gen_health ~ "General Health in past 30 days")
  ) |>
  bold_labels() |>
  bold_p()

Characteristic	OR	95% CI	p-value
General Health in past 30 days
Excellent/Very Good	—	—
Fair/Poor	28.1	23.1, 34.5	<0.001
Good	2.93	2.37, 3.65	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

# Wald p-values (for significance)
summary(mod_simple)$coefficients

##                      Estimate Std. Error    z value      Pr(>|z|)
## (Intercept)         -3.344212 0.08724909 -38.329475 1.976263e-321
## gen_healthFair/Poor  3.337002 0.10245176  32.571446 1.040833e-232
## gen_healthGood       1.075949 0.10999156   9.782105  1.343856e-22

# Step 3: Multiple Logistic Regression

mod_logistic <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health,
  data = brfss_analytic,
  family = binomial
)

# Model summary
summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex + 
##     exercise + depression + age_group + income + education + 
##     smoking + gen_health, family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.4819971  0.2754154 -12.643  < 2e-16 ***
## menthlth_days              0.0531123  0.0043127  12.315  < 2e-16 ***
## bmi                        0.0007573  0.0054204   0.140   0.8889    
## sexFemale                  0.1146198  0.0815481   1.406   0.1599    
## exerciseYes               -0.7243823  0.0849653  -8.526  < 2e-16 ***
## depressionYes              0.0487396  0.0981422   0.497   0.6195    
## age_group35-49             0.1833795  0.1561646   1.174   0.2403    
## age_group50-64             0.7804061  0.1425411   5.475 4.38e-08 ***
## age_group65+               0.7600142  0.1418878   5.356 8.49e-08 ***
## income$100K+              -0.4860535  0.2285625  -2.127   0.0335 *  
## income$25K-$49K           -0.0673544  0.1116534  -0.603   0.5463    
## income$50K-$99K           -0.4553597  0.1145283  -3.976 7.01e-05 ***
## educationCollege graduate  0.2555872  0.1745785   1.464   0.1432    
## educationHS/GED            0.1958283  0.1637161   1.196   0.2316    
## educationSome college      0.1311359  0.1675926   0.782   0.4339    
## smokingCurrent             0.0406113  0.1209912   0.336   0.7371    
## smokingFormer              0.1035453  0.0915102   1.132   0.2578    
## gen_healthFair/Poor        2.6478096  0.1136015  23.308  < 2e-16 ***
## gen_healthGood             0.7880321  0.1147087   6.870 6.43e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6306.5  on 7999  degrees of freedom
## Residual deviance: 4377.9  on 7981  degrees of freedom
## AIC: 4415.9
## 
## Number of Fisher Scoring iterations: 6

mod_logistic |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      menthlth_days ~ "Menatal Unhealth days (past 30 days)",
      bmi           ~ "BMI",
      exercise   ~ "Exercise (past 30 days)",
      depression ~ "Depression",
      education  ~ "Education Level",
      sex        ~ "Sex",
      age_group  ~ "Age group",
      smoking     ~ "Smoking status",
      income     ~ "Annual Household income",
      gen_health ~ "General Healath Status"
    )
  ) |>
  bold_labels() |>
  bold_p()

Characteristic	OR	95% CI	p-value
Menatal Unhealth days (past 30 days)	1.05	1.05, 1.06	<0.001
BMI	1.00	0.99, 1.01	0.9
Sex
Male	—	—
Female	1.12	0.96, 1.32	0.2
Exercise (past 30 days)
No	—	—
Yes	0.48	0.41, 0.57	<0.001
Depression
No	—	—
Yes	1.05	0.87, 1.27	0.6
Age group
18-34	—	—
35-49	1.20	0.89, 1.63	0.2
50-64	2.18	1.66, 2.90	<0.001
65+	2.14	1.63, 2.84	<0.001
Annual Household income
Less than $25K	—	—
$100K+	0.62	0.39, 0.95	0.033
$25K-$49K	0.93	0.75, 1.16	0.5
$50K-$99K	0.63	0.51, 0.79	<0.001
Education Level
Less than HS	—	—
College graduate	1.29	0.92, 1.82	0.14
HS/GED	1.22	0.88, 1.68	0.2
Some college	1.14	0.82, 1.59	0.4
Smoking status
Never	—	—
Current	1.04	0.82, 1.32	0.7
Former	1.11	0.93, 1.33	0.3
General Healath Status
Excellent/Very Good	—	—
Fair/Poor	14.1	11.3, 17.7	<0.001
Good	2.20	1.76, 2.76	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

# Optional fit statistics
AIC(mod_logistic)

## [1] 4415.938

# Step 4: Likelihood Ratio Test
# Testing income as a group

# Reduced model without income
mod_reduced <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + education + smoking + gen_health,
  data = brfss_analytic,
  family = binomial
)

# Likelihood ratio test
lrt_income <- anova(mod_reduced, mod_logistic, test = "Chisq")
lrt_income

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7984     4400.2                          
## 2      7981     4377.9  3   22.235 5.828e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Formatted table
as.data.frame(lrt_income) |>
  tibble::rownames_to_column("Model") |>
  kable(
    digits = 3,
    caption = "Likelihood Ratio Test Comparing Reduced Model and Full Model"
  ) |>
  kable_styling(full_width = FALSE)

Likelihood Ratio Test Comparing Reduced Model and Full Model
Model	Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
1	7984	4400.173	NA	NA	NA
2	7981	4377.938	3	22.235	0

# Step 5: ROC Curve and AUC

roc_obj <- roc(
  brfss_analytic$frequent_distress,
  fitted(mod_logistic)
)

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

# ROC curve
plot(
  roc_obj,
  main = "ROC Curve: Frequent Physical Distress Model",
  print.auc = TRUE
)

# AUC
auc(roc_obj)

## Area under the curve: 0.856

# 95% CI for AUC
ci.auc(roc_obj)

## 95% CI: 0.8428-0.8692 (DeLong)

# Optional tidy display
auc_est <- as.numeric(auc(roc_obj))
auc_ci <- ci.auc(roc_obj)

tibble(
  AUC = auc_est,
  Lower_CI = auc_ci[1],
  Upper_CI = auc_ci[3]
) |>
  kable(
    digits = 3,
    caption = "Area Under the ROC Curve (AUC) with 95% Confidence Interval"
  ) |>
  kable_styling(full_width = FALSE)

Area Under the ROC Curve (AUC) with 95% Confidence Interval
AUC	Lower_CI	Upper_CI
0.856	0.843	0.869

# Step 6: Hosmer-Lemeshow Test

hl_test <- hoslem.test(
  mod_logistic$y,
  fitted(mod_logistic),
  g = 10
)

hl_test

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 15.776, df = 8, p-value = 0.0457

# Formatted summary table
tibble(
  Statistic = as.numeric(hl_test$statistic),
  DF = as.numeric(hl_test$parameter),
  P_value = hl_test$p.value
) |>
  kable(
    digits = 3,
    caption = "Hosmer-Lemeshow Goodness-of-Fit Test"
  ) |>
  kable_styling(full_width = FALSE)

Hosmer-Lemeshow Goodness-of-Fit Test
Statistic	DF	P_value
15.776	8	0.046

Part 2, Step 1: Outcome Balance

The binary outcome frequent physical distress showed a reasonable distribution between cases and non-cases, suggesting the outcome was not extremely imbalanced. This is helpful for logistic regression because severe imbalance can affect model performance and classification accuracy.

Part 2, Step 2: Simple Logistic Regression

General health was chosen for the simple logistic regression because it showed the strongest association in the linear model. Compared with adults reporting excellent or very good health, those reporting fair/poor health had substantially higher odds of frequent physical distress, with a highly statistically significant association (p<0.001). Because the confidence interval excluded 1.0 and the p-value was well below 0.05, the association was statistically significant.

Part 2, Step 2: Simple Logistic Regression

Part 2, Step 4: Likelihood Ratio Test

The likelihood ratio test examined whether income as a group improved model fit. The deviance difference was 22.235 with 3 degrees of freedom (p<0.001), indicating a statistically significant improvement in model fit when income was included. Therefore, I rejected the null hypothesis and concluded that income contributed significantly to explaining frequent physical distress.

Part 2, Step 5. ROC/AUC

The logistic model had an AUC of 0.856 (95% CI: 0.843, 0.869), indicating excellent discrimination. This means the model performed well in distinguishing individuals with and without frequent physical distress. Because the AUC was between 0.8 and 0.9, the model had excellent predictive performance.

Part 2, Step 6: Hosmer-Lemeshow

The Hosmer-Lemeshow test produced χ² = 15.776 with 8 degrees of freedom and p = 0.046. Because the p-value was slightly below 0.05, there was some evidence of imperfect model fit. However, this result should be interpreted alongside the strong ROC performance (AUC = 0.856), which suggests that although calibration may not be perfect, the model still discriminates well. Together, these findings suggest good predictive performance with minor fit limitations.

Part 3: Reflection

A. Statistical Insights

The results suggest that physical health burden among U.S. adults is associated with mental health, exercise, age, income, and especially general health status. In both the linear and logistic models, general health was the strongest predictor. Individuals reporting fair or poor health had substantially more physically unhealthy days in the linear model and much higher odds of frequent physical distress in the logistic model. Mental unhealthy days were also consistently associated with worse physical health outcomes, while exercise appeared protective in both models. Some predictors, such as education, showed significance in the linear model but were not statistically significant in the adjusted logistic model, suggesting some predictors may influence the number of unhealthy days but not necessarily crossing the threshold of frequent distress. A major limitation of this analysis is that BRFSS data are cross-sectional, so associations can be identified but causality cannot be established. In addition, some potential confounders were not measured in the model, including access to healthcare, chronic disease burden, social support, and medication use.

B. Linear versus Logistic Regression

The linear regression model and logistic regression model provided related but different information about the same research question. The linear model estimated how predictors were associated with the number of physically unhealthy days, which provides information about changes in burden on a continuous scale. In contrast, the logistic model estimated odds of experiencing frequent physical distress, which is useful for identifying risk of a clinically meaningful threshold. Linear regression is helpful when interest lies in average changes in outcome levels, while logistic regression is more useful when the outcome is binary or when risk classification is important. The model evaluation criteria also differed across the two approaches. Linear regression relied on R², adjusted R², AIC, and BIC to compare models, whereas logistic regression used AUC and the Hosmer-Lemeshow test to evaluate discrimination and calibration.

C. AI Transparency

I used AI assistance primarily for debugging code and clarifying interpretation after first attempting the analyses independently. I was confused about the sleep variable but when I asked AI it clarify me about the unavailability of the variable on the dataset. I also asked questions to AI for different codeing steps that were present in the lecture to clarify me about why a specific code is being applied and how its working.

Computer Homework 4: Model Selection and Introduction to Logistic Regression

Tahia Sufyani

2026-04-24

Part 0: Data Preparation

Part 1: Model Selection

Part 2: Logistic Regression

Part 3: Reflection