Loading necessary packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
library(broom)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(gtsummary)
library(leaps)
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.3
## ResourceSelection 0.3-6 2023-06-27
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
## Setting theme "Compact"
## Setting theme "Compact"
brfss <- read_xpt("C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/New folder/LLCP2023XPT/LLCP2023.XPT")
dim(brfss)
## [1] 433323 350
names(brfss)
## [1] "_STATE" "FMONTH" "IDATE" "IMONTH" "IDAY" "IYEAR"
## [7] "DISPCODE" "SEQNO" "_PSU" "CTELENM1" "PVTRESD1" "COLGHOUS"
## [13] "STATERE1" "CELPHON1" "LADULT1" "NUMADULT" "RESPSLC1" "LANDSEX2"
## [19] "LNDSXBRT" "SAFETIME" "CTELNUM1" "CELLFON5" "CADULT1" "CELLSEX2"
## [25] "CELSXBRT" "PVTRESD3" "CCLGHOUS" "CSTATE1" "LANDLINE" "HHADULT"
## [31] "SEXVAR" "GENHLTH" "PHYSHLTH" "MENTHLTH" "POORHLTH" "PRIMINS1"
## [37] "PERSDOC3" "MEDCOST1" "CHECKUP1" "EXERANY2" "EXRACT12" "EXEROFT1"
## [43] "EXERHMM1" "EXRACT22" "EXEROFT2" "EXERHMM2" "STRENGTH" "BPHIGH6"
## [49] "BPMEDS1" "CHOLCHK3" "TOLDHI3" "CHOLMED3" "CVDINFR4" "CVDCRHD4"
## [55] "CVDSTRK3" "ASTHMA3" "ASTHNOW" "CHCSCNC1" "CHCOCNC1" "CHCCOPD3"
## [61] "ADDEPEV3" "CHCKDNY2" "HAVARTH4" "DIABETE4" "DIABAGE4" "MARITAL"
## [67] "EDUCA" "RENTHOM1" "NUMHHOL4" "NUMPHON4" "CPDEMO1C" "VETERAN3"
## [73] "EMPLOY1" "CHILDREN" "INCOME3" "PREGNANT" "WEIGHT2" "HEIGHT3"
## [79] "DEAF" "BLIND" "DECIDE" "DIFFWALK" "DIFFDRES" "DIFFALON"
## [85] "FALL12MN" "FALLINJ5" "SMOKE100" "SMOKDAY2" "USENOW3" "ECIGNOW2"
## [91] "ALCDAY4" "AVEDRNK3" "DRNK3GE5" "MAXDRNKS" "FLUSHOT7" "FLSHTMY3"
## [97] "PNEUVAC4" "SHINGLE2" "HIVTST7" "HIVTSTD3" "SEATBELT" "DRNKDRI2"
## [103] "COVIDPO1" "COVIDSM1" "COVIDACT" "PDIABTS1" "PREDIAB2" "DIABTYPE"
## [109] "INSULIN1" "CHKHEMO3" "EYEEXAM1" "DIABEYE1" "DIABEDU1" "FEETSORE"
## [115] "ARTHEXER" "ARTHEDU" "LMTJOIN3" "ARTHDIS2" "JOINPAI2" "LCSFIRST"
## [121] "LCSLAST" "LCSNUMCG" "LCSCTSC1" "LCSSCNCR" "LCSCTWHN" "HADMAM"
## [127] "HOWLONG" "CERVSCRN" "CRVCLCNC" "CRVCLPAP" "CRVCLHPV" "HADHYST2"
## [133] "PSATEST1" "PSATIME1" "PCPSARS2" "PSASUGS1" "PCSTALK2" "HADSIGM4"
## [139] "COLNSIGM" "COLNTES1" "SIGMTES1" "LASTSIG4" "COLNCNCR" "VIRCOLO1"
## [145] "VCLNTES2" "SMALSTOL" "STOLTEST" "STOOLDN2" "BLDSTFIT" "SDNATES1"
## [151] "CNCRDIFF" "CNCRAGE" "CNCRTYP2" "CSRVTRT3" "CSRVDOC1" "CSRVSUM"
## [157] "CSRVRTRN" "CSRVINST" "CSRVINSR" "CSRVDEIN" "CSRVCLIN" "CSRVPAIN"
## [163] "CSRVCTL2" "INDORTAN" "NUMBURN3" "SUNPRTCT" "WKDAYOUT" "WKENDOUT"
## [169] "CIMEMLO1" "CDWORRY" "CDDISCU1" "CDHOUS1" "CDSOCIA1" "CAREGIV1"
## [175] "CRGVREL4" "CRGVLNG1" "CRGVHRS1" "CRGVPRB3" "CRGVALZD" "CRGVPER1"
## [181] "CRGVHOU1" "CRGVEXPT" "LASTSMK2" "STOPSMK2" "MENTCIGS" "MENTECIG"
## [187] "HEATTBCO" "FIREARM5" "GUNLOAD" "LOADULK2" "HASYMP1" "HASYMP2"
## [193] "HASYMP3" "HASYMP4" "HASYMP5" "HASYMP6" "STRSYMP1" "STRSYMP2"
## [199] "STRSYMP3" "STRSYMP4" "STRSYMP5" "STRSYMP6" "FIRSTAID" "ASPIRIN"
## [205] "BIRTHSEX" "SOMALE" "SOFEMALE" "TRNSGNDR" "MARIJAN1" "MARJSMOK"
## [211] "MARJEAT" "MARJVAPE" "MARJDAB" "MARJOTHR" "USEMRJN4" "ACEDEPRS"
## [217] "ACEDRINK" "ACEDRUGS" "ACEPRISN" "ACEDIVRC" "ACEPUNCH" "ACEHURT1"
## [223] "ACESWEAR" "ACETOUCH" "ACETTHEM" "ACEHVSEX" "ACEADSAF" "ACEADNED"
## [229] "IMFVPLA4" "HPVADVC4" "HPVADSHT" "TETANUS1" "COVIDVA1" "COVACGE1"
## [235] "COVIDNU2" "LSATISFY" "EMTSUPRT" "SDLONELY" "SDHEMPLY" "FOODSTMP"
## [241] "SDHFOOD1" "SDHBILLS" "SDHUTILS" "SDHTRNSP" "SDHSTRE1" "RRCLASS3"
## [247] "RRCOGNT2" "RRTREAT" "RRATWRK2" "RRHCARE4" "RRPHYSM2" "RCSGEND1"
## [253] "RCSXBRTH" "RCSRLTN2" "CASTHDX2" "CASTHNO2" "QSTVER" "QSTLANG"
## [259] "_METSTAT" "_URBSTAT" "MSCODE" "_STSTR" "_STRWT" "_RAWRAKE"
## [265] "_WT2RAKE" "_IMPRACE" "_CHISPNC" "_CRACE1" "CAGEG" "_CLLCPWT"
## [271] "_DUALUSE" "_DUALCOR" "_LLCPWT2" "_LLCPWT" "_RFHLTH" "_PHYS14D"
## [277] "_MENT14D" "_HLTHPL1" "_HCVU653" "_TOTINDA" "METVL12_" "METVL22_"
## [283] "MAXVO21_" "FC601_" "ACTIN13_" "ACTIN23_" "PADUR1_" "PADUR2_"
## [289] "PAFREQ1_" "PAFREQ2_" "_MINAC12" "_MINAC22" "STRFREQ_" "PAMISS3_"
## [295] "PAMIN13_" "PAMIN23_" "PA3MIN_" "PAVIG13_" "PAVIG23_" "PA3VIGM_"
## [301] "_PACAT3" "_PAINDX3" "_PA150R4" "_PA300R4" "_PA30023" "_PASTRNG"
## [307] "_PAREC3" "_PASTAE3" "_RFHYPE6" "_CHOLCH3" "_RFCHOL3" "_MICHD"
## [313] "_LTASTH1" "_CASTHM1" "_ASTHMS1" "_DRDXAR2" "_MRACE1" "_HISPANC"
## [319] "_RACE" "_RACEG21" "_RACEGR3" "_RACEPRV" "_SEX" "_AGEG5YR"
## [325] "_AGE65YR" "_AGE80" "_AGE_G" "HTIN4" "HTM4" "WTKG3"
## [331] "_BMI5" "_BMI5CAT" "_RFBMI5" "_CHLDCNT" "_EDUCAG" "_INCOMG1"
## [337] "_SMOKER3" "_RFSMOK3" "_CURECI2" "DRNKANY6" "DROCDY4_" "_RFBING6"
## [343] "_DRNKWK2" "_RFDRHV8" "_FLSHOT7" "_PNEUMO3" "_AIDTST4" "_RFSEAT2"
## [349] "_RFSEAT3" "_DRNKDRV"
Recoding the new dataset
brfss_clean <- brfss |>
select(
PHYSHLTH, MENTHLTH, `_BMI5`, SEXVAR, EXERANY2,
ADDEPEV3, `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`, GENHLTH
) |>
mutate(
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
TRUE ~ as.numeric(PHYSHLTH)
),
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
TRUE ~ as.numeric(MENTHLTH)
),
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ `_BMI5` / 100
),
sex = factor(SEXVAR, levels = c(1, 2), labels = c("Male", "Female")),
exercise = factor(EXERANY2, levels = c(2, 1), labels = c("No", "Yes")),
depression = factor(ADDEPEV3, levels = c(2, 1), labels = c("No", "Yes")),
age_group = factor(case_when(
`_AGEG5YR` %in% 1:3 ~ "18-34",
`_AGEG5YR` %in% 4:6 ~ "35-49",
`_AGEG5YR` %in% 7:9 ~ "50-64",
`_AGEG5YR` %in% 10:13 ~ "65+",
TRUE ~ NA_character_
)),
income = factor(case_when(
`_INCOMG1` %in% 1:2 ~ "Less than $25K",
`_INCOMG1` %in% 3:4 ~ "$25K-$49K",
`_INCOMG1` %in% 5:6 ~ "$50K-$99K",
`_INCOMG1` == 7 ~ "$100K+",
TRUE ~ NA_character_
)),
education = factor(case_when(
EDUCA %in% 1:3 ~ "Less than HS",
EDUCA == 4 ~ "HS/GED",
EDUCA == 5 ~ "Some college",
EDUCA == 6 ~ "College graduate",
TRUE ~ NA_character_
)),
smoking = factor(case_when(
`_SMOKER3` %in% 1:2 ~ "Current",
`_SMOKER3` == 3 ~ "Former",
`_SMOKER3` == 4 ~ "Never",
TRUE ~ NA_character_
)),
gen_health = factor(case_when(
GENHLTH %in% 1:2 ~ "Excellent/Very Good",
GENHLTH == 3 ~ "Good",
GENHLTH %in% 4:5 ~ "Fair/Poor",
TRUE ~ NA_character_
))
) |>
select(
physhlth_days, menthlth_days, bmi, sex, exercise,
depression, age_group, income, education, smoking, gen_health
)
Setting Reference Level
brfss_clean <- brfss_clean |>
mutate(
sex = relevel(sex, ref = "Male"),
exercise = relevel(exercise, ref = "No"),
depression = relevel(depression, ref = "No"),
age_group = relevel(age_group, ref = "18-34"),
income = relevel(income, ref = "Less than $25K"),
education = relevel(education, ref = "Less than HS"),
smoking = relevel(smoking, ref = "Never"),
gen_health = relevel(gen_health, ref = "Excellent/Very Good")
)
Missing data handing
# Missing summary
summary(is.na(brfss_clean$physhlth_days))
## Mode FALSE TRUE
## logical 422538 10785
# Final dataset
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
nrow(brfss_analytic)
## [1] 8000
tbl_summary(brfss_analytic)
| Characteristic | N = 8,0001 |
|---|---|
| physhlth_days | 0 (0, 4) |
| menthlth_days | 0 (0, 5) |
| bmi | 27 (24, 32) |
| sex | |
| Male | 3,905 (49%) |
| Female | 4,095 (51%) |
| exercise | 6,224 (78%) |
| depression | 1,694 (21%) |
| age_group | |
| 18-34 | 1,328 (17%) |
| 35-49 | 1,650 (21%) |
| 50-64 | 2,145 (27%) |
| 65+ | 2,877 (36%) |
| income | |
| Less than $25K | 1,101 (14%) |
| $100K+ | 631 (7.9%) |
| $25K-$49K | 1,927 (24%) |
| $50K-$99K | 4,341 (54%) |
| education | |
| Less than HS | 384 (4.8%) |
| College graduate | 3,608 (45%) |
| HS/GED | 1,941 (24%) |
| Some college | 2,067 (26%) |
| smoking | |
| Never | 4,884 (61%) |
| Current | 855 (11%) |
| Former | 2,261 (28%) |
| gen_health | |
| Excellent/Very Good | 3,990 (50%) |
| Fair/Poor | 1,387 (17%) |
| Good | 2,623 (33%) |
| 1 Median (Q1, Q3); n (%) | |
# Step 1: Maximum model
mod_max <- lm(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic
)
tidy(mod_max, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.3194 | 0.5781 | 2.2825 | 0.0225 | 0.1863 | 2.4526 |
| menthlth_days | 0.1991 | 0.0109 | 18.2713 | 0.0000 | 0.1777 | 0.2204 |
| bmi | -0.0010 | 0.0123 | -0.0811 | 0.9354 | -0.0250 | 0.0230 |
| sexFemale | 0.2382 | 0.1583 | 1.5049 | 0.1324 | -0.0721 | 0.5484 |
| exerciseYes | -2.1206 | 0.1996 | -10.6241 | 0.0000 | -2.5118 | -1.7293 |
| depressionYes | 0.3477 | 0.2126 | 1.6351 | 0.1021 | -0.0691 | 0.7645 |
| age_group35-49 | 0.3536 | 0.2601 | 1.3591 | 0.1741 | -0.1564 | 0.8635 |
| age_group50-64 | 1.5652 | 0.2484 | 6.3017 | 0.0000 | 1.0783 | 2.0521 |
| age_group65+ | 1.6307 | 0.2425 | 6.7239 | 0.0000 | 1.1553 | 2.1062 |
| income$100K+ | -0.9897 | 0.3826 | -2.5870 | 0.0097 | -1.7397 | -0.2398 |
| income$25K-$49K | -0.4766 | 0.2670 | -1.7850 | 0.0743 | -1.0000 | 0.0468 |
| income$50K-$99K | -1.1806 | 0.2594 | -4.5508 | 0.0000 | -1.6891 | -0.6720 |
| educationCollege graduate | 1.3387 | 0.4028 | 3.3236 | 0.0009 | 0.5491 | 2.1282 |
| educationHS/GED | 0.8888 | 0.3931 | 2.2609 | 0.0238 | 0.1182 | 1.6594 |
| educationSome college | 1.1012 | 0.3978 | 2.7682 | 0.0056 | 0.3214 | 1.8811 |
| smokingCurrent | 0.3991 | 0.2697 | 1.4794 | 0.1391 | -0.1297 | 0.9278 |
| smokingFormer | 0.2341 | 0.1818 | 1.2874 | 0.1980 | -0.1223 | 0.5905 |
| gen_healthFair/Poor | 10.0607 | 0.2495 | 40.3167 | 0.0000 | 9.5715 | 10.5499 |
| gen_healthGood | 1.1876 | 0.1810 | 6.5628 | 0.0000 | 0.8328 | 1.5423 |
# Model fit statistics
glance(mod_max) |>
dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
mutate(across(everything(), \(x) round(x, 3))) |>
kable(caption = "Maximum Model: Fit Statistics") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| r.squared | adj.r.squared | sigma | AIC | BIC | df.residual |
|---|---|---|---|---|---|
| 0.347 | 0.345 | 6.891 | 53606.97 | 53746.71 | 7981 |
# Step 2: Best subsets regression
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic,
nvmax = ncol(model.matrix(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic
)) - 1,
method = "exhaustive"
)
best_summary <- summary(best_subsets)
subset_metrics <- tibble(
p = 1:length(best_summary$adjr2),
`Adj. R²` = best_summary$adjr2,
BIC = best_summary$bic,
Cp = best_summary$cp
)
# Adjusted R2 plot
p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(
title = "Adjusted R² by Model Size",
x = "Number of Predictors",
y = "Adjusted R²"
) +
theme_minimal(base_size = 12)
# BIC Plot
p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_summary$bic),
linetype = "dashed", color = "tomato") +
labs(
title = "BIC by Model Size",
x = "Number of Predictors",
y = "BIC"
) +
theme_minimal(base_size = 12)
# Both plots side-by-side
gridExtra::grid.arrange(p1, p2, ncol = 2)
which.max(best_summary$adjr2)
## [1] 17
which.min(best_summary$bic)
## [1] 7
coef(best_subsets, which.max(best_summary$adjr2))
## (Intercept) menthlth_days sexFemale
## 1.2922232 0.1990913 0.2384094
## exerciseYes depressionYes age_group35-49
## -2.1184627 0.3464440 0.3520047
## age_group50-64 age_group65+ income$100K+
## 1.5637608 1.6310257 -0.9895508
## income$25K-$49K income$50K-$99K educationCollege graduate
## -0.4772550 -1.1814628 1.3387368
## educationHS/GED educationSome college smokingCurrent
## 0.8880462 1.1002997 0.4010414
## smokingFormer gen_healthFair/Poor gen_healthGood
## 0.2341107 10.0574585 1.1851928
coef(best_subsets, which.min(best_summary$bic))
## (Intercept) menthlth_days exerciseYes age_group50-64
## 2.2684693 0.2104121 -2.0985245 1.4139167
## age_group65+ income$50K-$99K gen_healthFair/Poor gen_healthGood
## 1.5365529 -0.6232997 10.1107030 1.2085543
cat("Best model by Adj. R²:", which.max(best_summary$adjr2), "variables\n")
## Best model by Adj. R²: 17 variables
cat("Best model by BIC:", which.min(best_summary$bic), "variables\n")
## Best model by BIC: 7 variables
best_bic_idx <- which.min(best_summary$bic)
best_vars <- names(which(best_summary$which[best_bic_idx, -1]))
cat("\nVariables in BIC-best model:\n")
##
## Variables in BIC-best model:
cat(paste(" ", best_vars), sep = "\n")
## menthlth_days
## exerciseYes
## age_group50-64
## age_group65+
## income$50K-$99K
## gen_healthFair/Poor
## gen_healthGood
# Step 3: Stepwise selection methods
# Backward elimination
mod_back <- mod_max
cat("Step 1: Maximum model\n")
## Step 1: Maximum model
cat("Variables:", paste(names(coef(mod_back))[-1], collapse = ", "), "\n")
## Variables: menthlth_days, bmi, sexFemale, exerciseYes, depressionYes, age_group35-49, age_group50-64, age_group65+, income$100K+, income$25K-$49K, income$50K-$99K, educationCollege graduate, educationHS/GED, educationSome college, smokingCurrent, smokingFormer, gen_healthFair/Poor, gen_healthGood
pvals <- tidy(mod_back) |>
filter(term != "(Intercept)") |>
arrange(desc(p.value)) |>
dplyr::select(term, estimate, p.value) |>
mutate(across(where(is.numeric), \(x) round(x, 4)))
pvals |>
head(5) |>
kable(caption = "Maximum Model: Variables Sorted by p-value (Highest First)") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| term | estimate | p.value |
|---|---|---|
| bmi | -0.0010 | 0.9354 |
| smokingFormer | 0.2341 | 0.1980 |
| age_group35-49 | 0.3536 | 0.1741 |
| smokingCurrent | 0.3991 | 0.1391 |
| sexFemale | 0.2382 | 0.1324 |
mod_backward <- step(mod_max, direction = "backward", trace = 1)
## Start: AIC=30901.95
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## - bmi 1 0 378962 30900
## - smoking 2 147 379109 30901
## <none> 378962 30902
## - sex 1 108 379069 30902
## - depression 1 127 379089 30903
## - education 3 585 379547 30908
## - income 3 1184 380145 30921
## - age_group 3 3448 382410 30968
## - exercise 1 5360 384321 31012
## - menthlth_days 1 15852 394814 31228
## - gen_health 2 82226 461188 32469
##
## Step: AIC=30899.96
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## - smoking 2 149 379111 30899
## <none> 378962 30900
## - sex 1 108 379070 30900
## - depression 1 127 379089 30901
## - education 3 586 379548 30906
## - income 3 1187 380149 30919
## - age_group 3 3451 382413 30966
## - exercise 1 5440 384403 31012
## - menthlth_days 1 15856 394818 31226
## - gen_health 2 83608 462570 32491
##
## Step: AIC=30899.1
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + gen_health
##
## Df Sum of Sq RSS AIC
## - sex 1 86 379197 30899
## <none> 379111 30899
## - depression 1 143 379254 30900
## - education 3 518 379629 30904
## - income 3 1226 380337 30919
## - age_group 3 3677 382788 30970
## - exercise 1 5462 384573 31012
## - menthlth_days 1 16186 395297 31232
## - gen_health 2 84040 463151 32497
##
## Step: AIC=30898.91
## physhlth_days ~ menthlth_days + exercise + depression + age_group +
## income + education + gen_health
##
## Df Sum of Sq RSS AIC
## <none> 379197 30899
## - depression 1 174 379371 30901
## - education 3 551 379748 30905
## - income 3 1299 380496 30920
## - age_group 3 3726 382923 30971
## - exercise 1 5512 384709 31012
## - menthlth_days 1 16208 395405 31232
## - gen_health 2 83954 463151 32495
tidy(mod_backward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.5069 | 0.4596 | 3.2785 | 0.0010 | 0.6059 | 2.4079 |
| menthlth_days | 0.2006 | 0.0109 | 18.4742 | 0.0000 | 0.1793 | 0.2219 |
| exerciseYes | -2.1309 | 0.1978 | -10.7739 | 0.0000 | -2.5186 | -1.7432 |
| depressionYes | 0.4018 | 0.2101 | 1.9122 | 0.0559 | -0.0101 | 0.8137 |
| age_group35-49 | 0.4210 | 0.2573 | 1.6361 | 0.1019 | -0.0834 | 0.9253 |
| age_group50-64 | 1.6359 | 0.2454 | 6.6667 | 0.0000 | 1.1549 | 2.1170 |
| age_group65+ | 1.7003 | 0.2381 | 7.1401 | 0.0000 | 1.2335 | 2.1671 |
| income$100K+ | -1.0655 | 0.3800 | -2.8037 | 0.0051 | -1.8104 | -0.3205 |
| income$25K-$49K | -0.4877 | 0.2666 | -1.8292 | 0.0674 | -1.0103 | 0.0350 |
| income$50K-$99K | -1.2234 | 0.2576 | -4.7495 | 0.0000 | -1.7283 | -0.7185 |
| educationCollege graduate | 1.2925 | 0.3998 | 3.2330 | 0.0012 | 0.5088 | 2.0762 |
| educationHS/GED | 0.8799 | 0.3926 | 2.2414 | 0.0250 | 0.1104 | 1.6494 |
| educationSome college | 1.0942 | 0.3966 | 2.7590 | 0.0058 | 0.3168 | 1.8717 |
| gen_healthFair/Poor | 10.0658 | 0.2458 | 40.9539 | 0.0000 | 9.5840 | 10.5476 |
| gen_healthGood | 1.1949 | 0.1782 | 6.7058 | 0.0000 | 0.8456 | 1.5442 |
# Automated forward selection using AIC
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward", trace = 1)
## Start: AIC=34269.45
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 168850 411060 31520
## + menthlth_days 1 66457 513453 33298
## + exercise 1 38219 541691 33726
## + income 3 30450 549460 33844
## + depression 1 21930 557980 33963
## + education 3 10761 569149 34126
## + smoking 2 9915 569995 34135
## + bmi 1 7925 571985 34161
## + age_group 3 7295 572614 34174
## + sex 1 1400 578510 34252
## <none> 579910 34269
##
## Step: AIC=31520.38
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 19271.4 391789 31138
## + exercise 1 7169.1 403891 31382
## + depression 1 4595.4 406465 31432
## + income 3 3091.4 407969 31466
## + age_group 3 1764.5 409295 31492
## + smoking 2 1068.3 409992 31504
## + sex 1 875.5 410185 31505
## <none> 411060 31520
## + bmi 1 89.0 410971 31521
## + education 3 206.8 410853 31522
##
## Step: AIC=31138.25
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 6765.4 385023 31001
## + age_group 3 5018.7 386770 31041
## + income 3 1818.7 389970 31107
## + sex 1 445.3 391343 31131
## + smoking 2 446.8 391342 31133
## + depression 1 228.1 391560 31136
## <none> 391789 31138
## + bmi 1 48.3 391740 31139
## + education 3 137.1 391651 31141
##
## Step: AIC=31000.9
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 4094.6 380929 30921
## + income 3 1057.5 383966 30985
## + sex 1 318.7 384704 30996
## + depression 1 219.6 384804 30998
## + smoking 2 296.4 384727 30999
## + education 3 346.2 384677 31000
## <none> 385023 31001
## + bmi 1 12.0 385011 31003
##
## Step: AIC=30921.36
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 954.84 379974 30907
## + depression 1 259.79 380669 30918
## + sex 1 224.36 380704 30919
## <none> 380929 30921
## + smoking 2 127.87 380801 30923
## + education 3 197.55 380731 30923
## + bmi 1 3.02 380926 30923
##
## Step: AIC=30907.29
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + education 3 603.13 379371 30901
## + depression 1 226.05 379748 30905
## + sex 1 160.83 379813 30906
## <none> 379974 30907
## + bmi 1 0.31 379973 30909
## + smoking 2 69.38 379904 30910
##
## Step: AIC=30900.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + education
##
## Df Sum of Sq RSS AIC
## + depression 1 173.649 379197 30899
## + sex 1 116.600 379254 30900
## <none> 379371 30901
## + smoking 2 140.198 379230 30902
## + bmi 1 0.347 379370 30903
##
## Step: AIC=30898.91
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + education + depression
##
## Df Sum of Sq RSS AIC
## <none> 379197 30899
## + sex 1 86.111 379111 30899
## + smoking 2 126.998 379070 30900
## + bmi 1 2.214 379195 30901
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.5069 | 0.4596 | 3.2785 | 0.0010 | 0.6059 | 2.4079 |
| gen_healthFair/Poor | 10.0658 | 0.2458 | 40.9539 | 0.0000 | 9.5840 | 10.5476 |
| gen_healthGood | 1.1949 | 0.1782 | 6.7058 | 0.0000 | 0.8456 | 1.5442 |
| menthlth_days | 0.2006 | 0.0109 | 18.4742 | 0.0000 | 0.1793 | 0.2219 |
| exerciseYes | -2.1309 | 0.1978 | -10.7739 | 0.0000 | -2.5186 | -1.7432 |
| age_group35-49 | 0.4210 | 0.2573 | 1.6361 | 0.1019 | -0.0834 | 0.9253 |
| age_group50-64 | 1.6359 | 0.2454 | 6.6667 | 0.0000 | 1.1549 | 2.1170 |
| age_group65+ | 1.7003 | 0.2381 | 7.1401 | 0.0000 | 1.2335 | 2.1671 |
| income$100K+ | -1.0655 | 0.3800 | -2.8037 | 0.0051 | -1.8104 | -0.3205 |
| income$25K-$49K | -0.4877 | 0.2666 | -1.8292 | 0.0674 | -1.0103 | 0.0350 |
| income$50K-$99K | -1.2234 | 0.2576 | -4.7495 | 0.0000 | -1.7283 | -0.7185 |
| educationCollege graduate | 1.2925 | 0.3998 | 3.2330 | 0.0012 | 0.5088 | 2.0762 |
| educationHS/GED | 0.8799 | 0.3926 | 2.2414 | 0.0250 | 0.1104 | 1.6494 |
| educationSome college | 1.0942 | 0.3966 | 2.7590 | 0.0058 | 0.3168 | 1.8717 |
| depressionYes | 0.4018 | 0.2101 | 1.9122 | 0.0559 | -0.0101 | 0.8137 |
# Stepwise selection (both)
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both", trace = 1)
## Start: AIC=34269.45
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 168850 411060 31520
## + menthlth_days 1 66457 513453 33298
## + exercise 1 38219 541691 33726
## + income 3 30450 549460 33844
## + depression 1 21930 557980 33963
## + education 3 10761 569149 34126
## + smoking 2 9915 569995 34135
## + bmi 1 7925 571985 34161
## + age_group 3 7295 572614 34174
## + sex 1 1400 578510 34252
## <none> 579910 34269
##
## Step: AIC=31520.38
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 19271 391789 31138
## + exercise 1 7169 403891 31382
## + depression 1 4595 406465 31432
## + income 3 3091 407969 31466
## + age_group 3 1765 409295 31492
## + smoking 2 1068 409992 31504
## + sex 1 875 410185 31505
## <none> 411060 31520
## + bmi 1 89 410971 31521
## + education 3 207 410853 31522
## - gen_health 2 168850 579910 34269
##
## Step: AIC=31138.25
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 6765 385023 31001
## + age_group 3 5019 386770 31041
## + income 3 1819 389970 31107
## + sex 1 445 391343 31131
## + smoking 2 447 391342 31133
## + depression 1 228 391560 31136
## <none> 391789 31138
## + bmi 1 48 391740 31139
## + education 3 137 391651 31141
## - menthlth_days 1 19271 411060 31520
## - gen_health 2 121665 513453 33298
##
## Step: AIC=31000.9
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 4095 380929 30921
## + income 3 1058 383966 30985
## + sex 1 319 384704 30996
## + depression 1 220 384804 30998
## + smoking 2 296 384727 30999
## + education 3 346 384677 31000
## <none> 385023 31001
## + bmi 1 12 385011 31003
## - exercise 1 6765 391789 31138
## - menthlth_days 1 18868 403891 31382
## - gen_health 2 99662 484685 32838
##
## Step: AIC=30921.36
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 955 379974 30907
## + depression 1 260 380669 30918
## + sex 1 224 380704 30919
## <none> 380929 30921
## + smoking 2 128 380801 30923
## + education 3 198 380731 30923
## + bmi 1 3 380926 30923
## - age_group 3 4095 385023 31001
## - exercise 1 5841 386770 31041
## - menthlth_days 1 21621 402550 31361
## - gen_health 2 92101 473029 32650
##
## Step: AIC=30907.29
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + education 3 603 379371 30901
## + depression 1 226 379748 30905
## + sex 1 161 379813 30906
## <none> 379974 30907
## + bmi 1 0 379973 30909
## + smoking 2 69 379904 30910
## - income 3 955 380929 30921
## - age_group 3 3992 383966 30985
## - exercise 1 5200 385174 31014
## - menthlth_days 1 20471 400444 31325
## - gen_health 2 84470 464444 32509
##
## Step: AIC=30900.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + education
##
## Df Sum of Sq RSS AIC
## + depression 1 174 379197 30899
## + sex 1 117 379254 30900
## <none> 379371 30901
## + smoking 2 140 379230 30902
## + bmi 1 0 379370 30903
## - education 3 603 379974 30907
## - income 3 1360 380731 30923
## - age_group 3 3689 383059 30972
## - exercise 1 5531 384902 31014
## - menthlth_days 1 20150 399521 31313
## - gen_health 2 84954 464325 32513
##
## Step: AIC=30898.91
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + education + depression
##
## Df Sum of Sq RSS AIC
## <none> 379197 30899
## + sex 1 86 379111 30899
## + smoking 2 127 379070 30900
## - depression 1 174 379371 30901
## + bmi 1 2 379195 30901
## - education 3 551 379748 30905
## - income 3 1299 380496 30920
## - age_group 3 3726 382923 30971
## - exercise 1 5512 384709 31012
## - menthlth_days 1 16208 395405 31232
## - gen_health 2 83954 463151 32495
tidy(mod_stepwise, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.5069 | 0.4596 | 3.2785 | 0.0010 | 0.6059 | 2.4079 |
| gen_healthFair/Poor | 10.0658 | 0.2458 | 40.9539 | 0.0000 | 9.5840 | 10.5476 |
| gen_healthGood | 1.1949 | 0.1782 | 6.7058 | 0.0000 | 0.8456 | 1.5442 |
| menthlth_days | 0.2006 | 0.0109 | 18.4742 | 0.0000 | 0.1793 | 0.2219 |
| exerciseYes | -2.1309 | 0.1978 | -10.7739 | 0.0000 | -2.5186 | -1.7432 |
| age_group35-49 | 0.4210 | 0.2573 | 1.6361 | 0.1019 | -0.0834 | 0.9253 |
| age_group50-64 | 1.6359 | 0.2454 | 6.6667 | 0.0000 | 1.1549 | 2.1170 |
| age_group65+ | 1.7003 | 0.2381 | 7.1401 | 0.0000 | 1.2335 | 2.1671 |
| income$100K+ | -1.0655 | 0.3800 | -2.8037 | 0.0051 | -1.8104 | -0.3205 |
| income$25K-$49K | -0.4877 | 0.2666 | -1.8292 | 0.0674 | -1.0103 | 0.0350 |
| income$50K-$99K | -1.2234 | 0.2576 | -4.7495 | 0.0000 | -1.7283 | -0.7185 |
| educationCollege graduate | 1.2925 | 0.3998 | 3.2330 | 0.0012 | 0.5088 | 2.0762 |
| educationHS/GED | 0.8799 | 0.3926 | 2.2414 | 0.0250 | 0.1104 | 1.6494 |
| educationSome college | 1.0942 | 0.3966 | 2.7590 | 0.0058 | 0.3168 | 1.8717 |
| depressionYes | 0.4018 | 0.2101 | 1.9122 | 0.0559 | -0.0101 | 0.8137 |
# Variables retained in each model
attr(terms(mod_backward), "term.labels")
## [1] "menthlth_days" "exercise" "depression" "age_group"
## [5] "income" "education" "gen_health"
attr(terms(mod_forward), "term.labels")
## [1] "gen_health" "menthlth_days" "exercise" "age_group"
## [5] "income" "education" "depression"
attr(terms(mod_stepwise), "term.labels")
## [1] "gen_health" "menthlth_days" "exercise" "age_group"
## [5] "income" "education" "depression"
# Step 4: Choose final model
# Compare all methods in one table
method_comparison <- tribble(
~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
"Backward (AIC)",
length(coef(mod_backward)) - 1,
round(glance(mod_backward)$adj.r.squared, 4),
round(AIC(mod_backward), 1),
round(BIC(mod_backward), 1),
"Forward (AIC)",
length(coef(mod_forward)) - 1,
round(glance(mod_forward)$adj.r.squared, 4),
round(AIC(mod_forward), 1),
round(BIC(mod_forward), 1),
"Stepwise (AIC)",
length(coef(mod_stepwise)) - 1,
round(glance(mod_stepwise)$adj.r.squared, 4),
round(AIC(mod_stepwise), 1),
round(BIC(mod_stepwise), 1)
)
method_comparison |>
kable(caption = "Comparison of Automated Variable Selection Methods") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Method | Variables selected | Adj. R² | AIC | BIC |
|---|---|---|---|---|
| Backward (AIC) | 14 | 0.345 | 53603.9 | 53715.7 |
| Forward (AIC) | 14 | 0.345 | 53603.9 | 53715.7 |
| Stepwise (AIC) | 14 | 0.345 | 53603.9 | 53715.7 |
# As backward, forward, and stepwise all selected the same model,
mod_final <- mod_stepwise
# Final model summary
summary(mod_final)
##
## Call:
## lm(formula = physhlth_days ~ gen_health + menthlth_days + exercise +
## age_group + income + education + depression, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.9865 -2.6810 -1.1454 0.6303 30.2318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.50689 0.45962 3.279 0.00105 **
## gen_healthFair/Poor 10.06584 0.24578 40.954 < 2e-16 ***
## gen_healthGood 1.19494 0.17819 6.706 2.14e-11 ***
## menthlth_days 0.20064 0.01086 18.474 < 2e-16 ***
## exerciseYes -2.13087 0.19778 -10.774 < 2e-16 ***
## age_group35-49 0.42096 0.25730 1.636 0.10187
## age_group50-64 1.63593 0.24539 6.667 2.79e-11 ***
## age_group65+ 1.70027 0.23813 7.140 1.01e-12 ***
## income$100K+ -1.06549 0.38003 -2.804 0.00506 **
## income$25K-$49K -0.48766 0.26660 -1.829 0.06741 .
## income$50K-$99K -1.22340 0.25759 -4.749 2.08e-06 ***
## educationCollege graduate 1.29249 0.39978 3.233 0.00123 **
## educationHS/GED 0.87988 0.39256 2.241 0.02503 *
## educationSome college 1.09424 0.39661 2.759 0.00581 **
## depressionYes 0.40182 0.21013 1.912 0.05588 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.891 on 7985 degrees of freedom
## Multiple R-squared: 0.3461, Adjusted R-squared: 0.345
## F-statistic: 301.9 on 14 and 7985 DF, p-value: < 2.2e-16
# Coefficient table with 95% confidence intervals
final_coef_table <- tidy(mod_final, conf.int = TRUE)
final_coef_table |>
kable(digits = 3, caption = "Coefficient Table for the Final Linear Regression Model") |>
kable_styling(full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 1.507 | 0.460 | 3.279 | 0.001 | 0.606 | 2.408 |
| gen_healthFair/Poor | 10.066 | 0.246 | 40.954 | 0.000 | 9.584 | 10.548 |
| gen_healthGood | 1.195 | 0.178 | 6.706 | 0.000 | 0.846 | 1.544 |
| menthlth_days | 0.201 | 0.011 | 18.474 | 0.000 | 0.179 | 0.222 |
| exerciseYes | -2.131 | 0.198 | -10.774 | 0.000 | -2.519 | -1.743 |
| age_group35-49 | 0.421 | 0.257 | 1.636 | 0.102 | -0.083 | 0.925 |
| age_group50-64 | 1.636 | 0.245 | 6.667 | 0.000 | 1.155 | 2.117 |
| age_group65+ | 1.700 | 0.238 | 7.140 | 0.000 | 1.233 | 2.167 |
| income$100K+ | -1.065 | 0.380 | -2.804 | 0.005 | -1.810 | -0.321 |
| income$25K-$49K | -0.488 | 0.267 | -1.829 | 0.067 | -1.010 | 0.035 |
| income$50K-$99K | -1.223 | 0.258 | -4.749 | 0.000 | -1.728 | -0.718 |
| educationCollege graduate | 1.292 | 0.400 | 3.233 | 0.001 | 0.509 | 2.076 |
| educationHS/GED | 0.880 | 0.393 | 2.241 | 0.025 | 0.110 | 1.649 |
| educationSome college | 1.094 | 0.397 | 2.759 | 0.006 | 0.317 | 1.872 |
| depressionYes | 0.402 | 0.210 | 1.912 | 0.056 | -0.010 | 0.814 |
# Final model fit statistics
tibble(
Model = "Final Model",
R_squared = summary(mod_final)$r.squared,
Adjusted_R_squared = summary(mod_final)$adj.r.squared,
AIC = AIC(mod_final),
BIC = BIC(mod_final)
) |>
kable(digits = 3, caption = "Fit Statistics for the Final Linear Regression Model") |>
kable_styling(full_width = FALSE)
| Model | R_squared | Adjusted_R_squared | AIC | BIC |
|---|---|---|---|---|
| Final Model | 0.346 | 0.345 | 53603.93 | 53715.73 |
# Show retained variables in final model
attr(terms(mod_final), "term.labels")
## [1] "gen_health" "menthlth_days" "exercise" "age_group"
## [5] "income" "education" "depression"
tidy(mod_final, conf.int = TRUE)
## # A tibble: 15 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.51 0.460 3.28 1.05e- 3 0.606 2.41
## 2 gen_healthFair/Poor 10.1 0.246 41.0 0 9.58 10.5
## 3 gen_healthGood 1.19 0.178 6.71 2.14e-11 0.846 1.54
## 4 menthlth_days 0.201 0.0109 18.5 1.18e-74 0.179 0.222
## 5 exerciseYes -2.13 0.198 -10.8 6.99e-27 -2.52 -1.74
## 6 age_group35-49 0.421 0.257 1.64 1.02e- 1 -0.0834 0.925
## 7 age_group50-64 1.64 0.245 6.67 2.79e-11 1.15 2.12
## 8 age_group65+ 1.70 0.238 7.14 1.01e-12 1.23 2.17
## 9 income$100K+ -1.07 0.380 -2.80 5.06e- 3 -1.81 -0.321
## 10 income$25K-$49K -0.488 0.267 -1.83 6.74e- 2 -1.01 0.0350
## 11 income$50K-$99K -1.22 0.258 -4.75 2.08e- 6 -1.73 -0.718
## 12 educationCollege gr… 1.29 0.400 3.23 1.23e- 3 0.509 2.08
## 13 educationHS/GED 0.880 0.393 2.24 2.50e- 2 0.110 1.65
## 14 educationSome colle… 1.09 0.397 2.76 5.81e- 3 0.317 1.87
## 15 depressionYes 0.402 0.210 1.91 5.59e- 2 -0.0101 0.814
# Step 5: LINE assumption check
par(mfrow = c(2, 2))
plot(mod_final)
par(mfrow = c(1, 1))
Part 1, Step 1: Maximum Model Interpretation
The maximum linear regression model explained a moderate amount of variation in physically unhealthy days, with an R² of 0.347 and an adjusted R² of 0.345, meaning about 34.5% of the variability in physical health burden was explained by the predictors in the model. The model AIC was 53,606.97 and BIC was 53,746.71. Several predictors were strongly associated with more unhealthy days, especially fair/poor general health (β = 10.06, 95% CI: 9.57, 10.55, p<0.001), good general health (β = 1.19, 95% CI: 0.83, 1.54, p<0.001), and mental unhealthy days (β = 0.20, 95% CI: 0.18, 0.22, p<0.001). Exercise was protective, with those reporting exercise having 2.12 fewer unhealthy days on average than non-exercisers (β = -2.12, p<0.001).
Part 1, Step 2: Best Subsets Interpretation
The best subsets regression showed that the model maximizing adjusted R² included 17 parameters, whereas the model minimizing BIC included only 7 parameters. These criteria did not agree. Adjusted R² favored a larger, more complex model, while BIC favored a simpler model. This is expected because BIC penalizes model complexity more strongly. The BIC-selected model retained mental unhealthy days, exercise, age 50–64, age 65+, income $50K–$99K, and general health indicators, suggesting these were the most important predictors.
Part 1, Step 3: Stepwise Methods Comparison
Backward elimination, forward selection, and stepwise selection all resulted in the same final model with 14 variables retained. All three methods also had identical adjusted R² (0.345), AIC (53,603.9), and BIC (53,715.7), showing strong agreement and model stability. Variables excluded by all methods were BMI, sex, and smoking, while predictors such as mental unhealthy days, exercise, age group, income, education, and general health were retained across all methods. Because all methods agreed, there was strong evidence that the selected predictors were robust.
Part 1, Step 4: Final Model Interpretation
Because all three automated selection methods produced the same final model, I selected this model as the final model. Holding all other variables constant, each additional mentally unhealthy day was associated with 0.20 more physically unhealthy days (β = 0.2006, p<0.001). Adults who exercised had 2.13 fewer unhealthy days than those who did not exercise (β = -2.1309, p<0.001). Compared with adults reporting excellent or very good health, those reporting fair/poor health had 10.07 more unhealthy days on average (β = 10.07, p<0.001), indicating general health was the strongest predictor.
Part 1, Step 5: LINE Assumptions
The Residuals vs Fitted plot suggested the linearity assumption was reasonably satisfied, with only mild evidence of non-linearity. The Normal Q-Q plot showed residuals were approximately normal, although some deviation was observed in the tails. The Scale-Location plot suggested roughly constant variance, with only minor heteroscedasticity. The Residuals vs Leverage plot did not suggest major influential observations. Overall, the LINE assumptions appeared reasonably satisfied, with only minor deviations that were not severe enough to invalidate the model.
# Step 1 Create binary outcome
brfss_analytic <- brfss_analytic |>
mutate(
frequent_distress = factor(
ifelse(physhlth_days >= 14, "Yes", "No"),
levels = c("No", "Yes")
)
)
# Prevalence: counts
table(brfss_analytic$frequent_distress)
##
## No Yes
## 6927 1073
# Prevalence: percentages
prop.table(table(brfss_analytic$frequent_distress)) * 100
##
## No Yes
## 86.5875 13.4125
# Formatted prevalence table
distress_prev <- brfss_analytic |>
count(frequent_distress) |>
mutate(
Percent = round(100 * n / sum(n), 1)
)
distress_prev |>
kable(
caption = "Prevalence of Frequent Physical Distress",
col.names = c("Frequent Distress", "Count", "Percent")
) |>
kable_styling(full_width = FALSE)
| Frequent Distress | Count | Percent |
|---|---|---|
| No | 6927 | 86.6 |
| Yes | 1073 | 13.4 |
# Step 2: Simple (Unadjusted) Logistic Regression
# Unadjusted model using general health
mod_simple <- glm(
frequent_distress ~ gen_health,
data = brfss_analytic,
family = binomial
)
# Model summary
summary(mod_simple)
##
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial,
## data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.34421 0.08725 -38.329 <2e-16 ***
## gen_healthFair/Poor 3.33700 0.10245 32.571 <2e-16 ***
## gen_healthGood 1.07595 0.10999 9.782 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6306.5 on 7999 degrees of freedom
## Residual deviance: 4741.8 on 7997 degrees of freedom
## AIC: 4747.8
##
## Number of Fisher Scoring iterations: 6
# Odds ratios
exp(coef(mod_simple))
## (Intercept) gen_healthFair/Poor gen_healthGood
## 0.03528801 28.13465602 2.93277488
# 95% CI for ORs
exp(confint(mod_simple))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.02960235 0.04168464
## gen_healthFair/Poor 23.08938016 34.50941676
## gen_healthGood 2.36832877 3.64628926
# Formatted OR table
simple_or <- tidy(
mod_simple,
exponentiate = TRUE,
conf.int = TRUE
)
simple_or |>
kable(
digits = 3,
caption = "Unadjusted Odds Ratios for General Health and Frequent Physical Distress"
) |>
kable_styling(full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.035 | 0.087 | -38.329 | 0 | 0.030 | 0.042 |
| gen_healthFair/Poor | 28.135 | 0.102 | 32.571 | 0 | 23.089 | 34.509 |
| gen_healthGood | 2.933 | 0.110 | 9.782 | 0 | 2.368 | 3.646 |
#Publication-Ready Table
mod_simple |>
tbl_regression(
exponentiate = TRUE,
label = list(gen_health ~ "General Health in past 30 days")
) |>
bold_labels() |>
bold_p()
| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| General Health in past 30 days | |||
| Excellent/Very Good | — | — | |
| Fair/Poor | 28.1 | 23.1, 34.5 | <0.001 |
| Good | 2.93 | 2.37, 3.65 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
# Wald p-values (for significance)
summary(mod_simple)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.344212 0.08724909 -38.329475 1.976263e-321
## gen_healthFair/Poor 3.337002 0.10245176 32.571446 1.040833e-232
## gen_healthGood 1.075949 0.10999156 9.782105 1.343856e-22
# Step 3: Multiple Logistic Regression
mod_logistic <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic,
family = binomial
)
# Model summary
summary(mod_logistic)
##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex +
## exercise + depression + age_group + income + education +
## smoking + gen_health, family = binomial, data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.4819971 0.2754154 -12.643 < 2e-16 ***
## menthlth_days 0.0531123 0.0043127 12.315 < 2e-16 ***
## bmi 0.0007573 0.0054204 0.140 0.8889
## sexFemale 0.1146198 0.0815481 1.406 0.1599
## exerciseYes -0.7243823 0.0849653 -8.526 < 2e-16 ***
## depressionYes 0.0487396 0.0981422 0.497 0.6195
## age_group35-49 0.1833795 0.1561646 1.174 0.2403
## age_group50-64 0.7804061 0.1425411 5.475 4.38e-08 ***
## age_group65+ 0.7600142 0.1418878 5.356 8.49e-08 ***
## income$100K+ -0.4860535 0.2285625 -2.127 0.0335 *
## income$25K-$49K -0.0673544 0.1116534 -0.603 0.5463
## income$50K-$99K -0.4553597 0.1145283 -3.976 7.01e-05 ***
## educationCollege graduate 0.2555872 0.1745785 1.464 0.1432
## educationHS/GED 0.1958283 0.1637161 1.196 0.2316
## educationSome college 0.1311359 0.1675926 0.782 0.4339
## smokingCurrent 0.0406113 0.1209912 0.336 0.7371
## smokingFormer 0.1035453 0.0915102 1.132 0.2578
## gen_healthFair/Poor 2.6478096 0.1136015 23.308 < 2e-16 ***
## gen_healthGood 0.7880321 0.1147087 6.870 6.43e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6306.5 on 7999 degrees of freedom
## Residual deviance: 4377.9 on 7981 degrees of freedom
## AIC: 4415.9
##
## Number of Fisher Scoring iterations: 6
mod_logistic |>
tbl_regression(
exponentiate = TRUE,
label = list(
menthlth_days ~ "Menatal Unhealth days (past 30 days)",
bmi ~ "BMI",
exercise ~ "Exercise (past 30 days)",
depression ~ "Depression",
education ~ "Education Level",
sex ~ "Sex",
age_group ~ "Age group",
smoking ~ "Smoking status",
income ~ "Annual Household income",
gen_health ~ "General Healath Status"
)
) |>
bold_labels() |>
bold_p()
| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| Menatal Unhealth days (past 30 days) | 1.05 | 1.05, 1.06 | <0.001 |
| BMI | 1.00 | 0.99, 1.01 | 0.9 |
| Sex | |||
| Male | — | — | |
| Female | 1.12 | 0.96, 1.32 | 0.2 |
| Exercise (past 30 days) | |||
| No | — | — | |
| Yes | 0.48 | 0.41, 0.57 | <0.001 |
| Depression | |||
| No | — | — | |
| Yes | 1.05 | 0.87, 1.27 | 0.6 |
| Age group | |||
| 18-34 | — | — | |
| 35-49 | 1.20 | 0.89, 1.63 | 0.2 |
| 50-64 | 2.18 | 1.66, 2.90 | <0.001 |
| 65+ | 2.14 | 1.63, 2.84 | <0.001 |
| Annual Household income | |||
| Less than $25K | — | — | |
| $100K+ | 0.62 | 0.39, 0.95 | 0.033 |
| $25K-$49K | 0.93 | 0.75, 1.16 | 0.5 |
| $50K-$99K | 0.63 | 0.51, 0.79 | <0.001 |
| Education Level | |||
| Less than HS | — | — | |
| College graduate | 1.29 | 0.92, 1.82 | 0.14 |
| HS/GED | 1.22 | 0.88, 1.68 | 0.2 |
| Some college | 1.14 | 0.82, 1.59 | 0.4 |
| Smoking status | |||
| Never | — | — | |
| Current | 1.04 | 0.82, 1.32 | 0.7 |
| Former | 1.11 | 0.93, 1.33 | 0.3 |
| General Healath Status | |||
| Excellent/Very Good | — | — | |
| Fair/Poor | 14.1 | 11.3, 17.7 | <0.001 |
| Good | 2.20 | 1.76, 2.76 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
# Optional fit statistics
AIC(mod_logistic)
## [1] 4415.938
# Step 4: Likelihood Ratio Test
# Testing income as a group
# Reduced model without income
mod_reduced <- glm(
frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
age_group + education + smoking + gen_health,
data = brfss_analytic,
family = binomial
)
# Likelihood ratio test
lrt_income <- anova(mod_reduced, mod_logistic, test = "Chisq")
lrt_income
## Analysis of Deviance Table
##
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7984 4400.2
## 2 7981 4377.9 3 22.235 5.828e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Formatted table
as.data.frame(lrt_income) |>
tibble::rownames_to_column("Model") |>
kable(
digits = 3,
caption = "Likelihood Ratio Test Comparing Reduced Model and Full Model"
) |>
kable_styling(full_width = FALSE)
| Model | Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|---|
| 1 | 7984 | 4400.173 | NA | NA | NA |
| 2 | 7981 | 4377.938 | 3 | 22.235 | 0 |
# Step 5: ROC Curve and AUC
roc_obj <- roc(
brfss_analytic$frequent_distress,
fitted(mod_logistic)
)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
# ROC curve
plot(
roc_obj,
main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE
)
# AUC
auc(roc_obj)
## Area under the curve: 0.856
# 95% CI for AUC
ci.auc(roc_obj)
## 95% CI: 0.8428-0.8692 (DeLong)
# Optional tidy display
auc_est <- as.numeric(auc(roc_obj))
auc_ci <- ci.auc(roc_obj)
tibble(
AUC = auc_est,
Lower_CI = auc_ci[1],
Upper_CI = auc_ci[3]
) |>
kable(
digits = 3,
caption = "Area Under the ROC Curve (AUC) with 95% Confidence Interval"
) |>
kable_styling(full_width = FALSE)
| AUC | Lower_CI | Upper_CI |
|---|---|---|
| 0.856 | 0.843 | 0.869 |
# Step 6: Hosmer-Lemeshow Test
hl_test <- hoslem.test(
mod_logistic$y,
fitted(mod_logistic),
g = 10
)
hl_test
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 15.776, df = 8, p-value = 0.0457
# Formatted summary table
tibble(
Statistic = as.numeric(hl_test$statistic),
DF = as.numeric(hl_test$parameter),
P_value = hl_test$p.value
) |>
kable(
digits = 3,
caption = "Hosmer-Lemeshow Goodness-of-Fit Test"
) |>
kable_styling(full_width = FALSE)
| Statistic | DF | P_value |
|---|---|---|
| 15.776 | 8 | 0.046 |
Part 2, Step 1: Outcome Balance
The binary outcome frequent physical distress showed a reasonable distribution between cases and non-cases, suggesting the outcome was not extremely imbalanced. This is helpful for logistic regression because severe imbalance can affect model performance and classification accuracy.
Part 2, Step 2: Simple Logistic Regression
General health was chosen for the simple logistic regression because it showed the strongest association in the linear model. Compared with adults reporting excellent or very good health, those reporting fair/poor health had substantially higher odds of frequent physical distress, with a highly statistically significant association (p<0.001). Because the confidence interval excluded 1.0 and the p-value was well below 0.05, the association was statistically significant.
Part 2, Step 2: Simple Logistic Regression
General health was chosen for the simple logistic regression because it showed the strongest association in the linear model. Compared with adults reporting excellent or very good health, those reporting fair/poor health had substantially higher odds of frequent physical distress, with a highly statistically significant association (p<0.001). Because the confidence interval excluded 1.0 and the p-value was well below 0.05, the association was statistically significant.
Part 2, Step 4: Likelihood Ratio Test
The likelihood ratio test examined whether income as a group improved model fit. The deviance difference was 22.235 with 3 degrees of freedom (p<0.001), indicating a statistically significant improvement in model fit when income was included. Therefore, I rejected the null hypothesis and concluded that income contributed significantly to explaining frequent physical distress.
Part 2, Step 5. ROC/AUC
The logistic model had an AUC of 0.856 (95% CI: 0.843, 0.869), indicating excellent discrimination. This means the model performed well in distinguishing individuals with and without frequent physical distress. Because the AUC was between 0.8 and 0.9, the model had excellent predictive performance.
Part 2, Step 6: Hosmer-Lemeshow
The Hosmer-Lemeshow test produced χ² = 15.776 with 8 degrees of freedom and p = 0.046. Because the p-value was slightly below 0.05, there was some evidence of imperfect model fit. However, this result should be interpreted alongside the strong ROC performance (AUC = 0.856), which suggests that although calibration may not be perfect, the model still discriminates well. Together, these findings suggest good predictive performance with minor fit limitations.
A. Statistical Insights
The results suggest that physical health burden among U.S. adults is associated with mental health, exercise, age, income, and especially general health status. In both the linear and logistic models, general health was the strongest predictor. Individuals reporting fair or poor health had substantially more physically unhealthy days in the linear model and much higher odds of frequent physical distress in the logistic model. Mental unhealthy days were also consistently associated with worse physical health outcomes, while exercise appeared protective in both models. Some predictors, such as education, showed significance in the linear model but were not statistically significant in the adjusted logistic model, suggesting some predictors may influence the number of unhealthy days but not necessarily crossing the threshold of frequent distress. A major limitation of this analysis is that BRFSS data are cross-sectional, so associations can be identified but causality cannot be established. In addition, some potential confounders were not measured in the model, including access to healthcare, chronic disease burden, social support, and medication use.
B. Linear versus Logistic Regression
The linear regression model and logistic regression model provided related but different information about the same research question. The linear model estimated how predictors were associated with the number of physically unhealthy days, which provides information about changes in burden on a continuous scale. In contrast, the logistic model estimated odds of experiencing frequent physical distress, which is useful for identifying risk of a clinically meaningful threshold. Linear regression is helpful when interest lies in average changes in outcome levels, while logistic regression is more useful when the outcome is binary or when risk classification is important. The model evaluation criteria also differed across the two approaches. Linear regression relied on R², adjusted R², AIC, and BIC to compare models, whereas logistic regression used AUC and the Hosmer-Lemeshow test to evaluate discrimination and calibration.
C. AI Transparency
I used AI assistance primarily for debugging code and clarifying interpretation after first attempting the analyses independently. I was confused about the sleep variable but when I asked AI it clarify me about the unavailability of the variable on the dataset. I also asked questions to AI for different codeing steps that were present in the lecture to clarify me about why a specific code is being applied and how its working.