library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
## Warning: package 'haven' was built under R version 4.4.3
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.4.3
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.4.3
## ResourceSelection 0.3-6 2023-06-27
brfss_data <- read_xpt("/Users/sarah/OneDrive/Documents/EPI 553/data/LLCP2023.XPT") %>%
janitor::clean_names()
names(brfss_data)
## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
#Select variables
brfss_work<- brfss_data |>
dplyr::select(physhlth, menthlth, bmi5, sexvar, educa, exerany2, addepev3, ageg5yr, incomg1, smoker3, genhlth, marital)
brfss_clean <- brfss_work |>
mutate(
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth %in% c (77,99) ~ NA_real_,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth)
),
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77,99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth)
),
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ bmi5 / 100
),
sex= factor(
sexvar,
levels = c(1,2),
labels = c("Male","Female")
),
exercise = factor(exerany2,
levels = c(2,1),
labels = c("No", "Yes")),
depression = factor(addepev3,
levels = c(2,1),
labels = c("No", "Yes")),
age_group = case_when(
ageg5yr %in% 1:3 ~ "18-34",
ageg5yr %in% 4:6 ~ "35-49",
ageg5yr %in% 7:9 ~ "50-64",
ageg5yr %in% 10:13 ~ "65+",
TRUE ~ NA_character_
),
smoking = case_when(
smoker3 %in% 1:2 ~ "Current",
smoker3 == 3 ~ "Former",
smoker3 == 4 ~ "Never",
TRUE ~ NA_character_
),
income = case_when(
incomg1 %in% 1:2 ~ "Less than $25K",
incomg1 %in% 3:4 ~ "$25K-$49K",
incomg1 %in% 5:6 ~ "$50K-$99K",
incomg1 == 7 ~ "$100K+",
TRUE ~ NA_character_
),
gen_health = case_when(
genhlth %in% 1:2 ~ "Excellent/Very Good",
genhlth == 3 ~ "Good",
genhlth %in% 4:5 ~ "Fair/Poor",
TRUE ~ NA_character_
),
education = case_when(
educa %in% 1:3 ~ "Less than HS",
educa == 4 ~ "HS/GED",
educa == 5 ~ "Some college",
educa == 6 ~ "College graduate",
TRUE ~ NA_character_
),
marital = case_when(
marital %in% c(1, 6) ~ "Married/Partnered",
marital %in% 2:4 ~ "Previously married",
marital == 5 ~ "Never married",
TRUE ~ NA_character_
)
) %>%
mutate(
# Set reference levels
exercise = relevel(exercise, ref = "No"),
depression = relevel(depression, ref = "No"),
age_group = factor(age_group, levels = c("18-34", "35-49", "50-64", "65+")),
smoking = factor(smoking, levels = c("Never", "Former", "Current")),
gen_health = factor(gen_health, levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
education = factor(education, levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
income = factor(income, levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
marital = factor(marital, levels = c("Married/Partnered", "Previously married", "Never married"))
)
#Missingness before dropping NAs
missing_summary <- brfss_clean |>
summarise(
n_missing_physhlth = sum(is.na(physhlth_days)),
pct_missing_physhlth = mean(is.na(physhlth_days)) * 100,
n_missing_menthlth = sum(is.na(menthlth_days)),
pct_missing_menthlth = mean(is.na(menthlth_days)) * 100,
n_missing_bmi = sum(is.na(bmi)),
pct_missing_bmi = mean(is.na(bmi)) * 100
)
missing_summary
## # A tibble: 1 × 6
## n_missing_physhlth pct_missing_physhlth n_missing_menthlth
## <int> <dbl> <int>
## 1 10785 2.49 8108
## # ℹ 3 more variables: pct_missing_menthlth <dbl>, n_missing_bmi <int>,
## # pct_missing_bmi <dbl>
#remove NA and sample
set.seed(1220)
brfss_analytic <- brfss_clean %>%
drop_na()%>%
slice_sample(n = 8000)
#final analytic sample size
nrow(brfss_analytic)
## [1] 8000
brfss_analytic %>%
dplyr::select(physhlth_days, menthlth_days, sex, exercise, depression, age_group, smoking, income, gen_health, education, marital) %>%
tbl_summary(
label = list(
physhlth_days ~ "Physically Unhealthy Days",
menthlth_days ~ "Mentally Unhealthy Days",
sex ~ "Sex",
exercise ~ "Any Physical Activity",
depression ~ "Depressive disorder",
age_group ~ "Age",
smoking ~ "Smoking Status",
income ~ "Income",
gen_health ~ "General Health Status",
education ~ "Education Level",
marital ~ "Marital Status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) %>%
add_n() %>%
bold_labels() %>%
modify_caption("**Table 1. Descriptive Statistics - BRFSS 2023 Analytic Sample ( n =8,000)")
| Characteristic | N | N = 8,0001 |
|---|---|---|
| Physically Unhealthy Days | 8,000 | 4.3 (8.7) |
| Mentally Unhealthy Days | 8,000 | 4.3 (8.2) |
| Sex | 8,000 | |
| Male | 3,971 (50%) | |
| Female | 4,029 (50%) | |
| Any Physical Activity | 8,000 | 6,157 (77%) |
| Depressive disorder | 8,000 | 1,776 (22%) |
| Age | 8,000 | |
| 18-34 | 1,294 (16%) | |
| 35-49 | 1,657 (21%) | |
| 50-64 | 2,109 (26%) | |
| 65+ | 2,940 (37%) | |
| Smoking Status | 8,000 | |
| Never | 4,808 (60%) | |
| Former | 2,230 (28%) | |
| Current | 962 (12%) | |
| Income | 8,000 | |
| Less than $25K | 1,090 (14%) | |
| $25K-$49K | 1,931 (24%) | |
| $50K-$99K | 4,297 (54%) | |
| $100K+ | 682 (8.5%) | |
| General Health Status | 8,000 | |
| Excellent/Very Good | 3,926 (49%) | |
| Good | 2,648 (33%) | |
| Fair/Poor | 1,426 (18%) | |
| Education Level | 8,000 | |
| Less than HS | 391 (4.9%) | |
| HS/GED | 1,887 (24%) | |
| Some college | 2,115 (26%) | |
| College graduate | 3,607 (45%) | |
| Marital Status | 8,000 | |
| Married/Partnered | 4,582 (57%) | |
| Previously married | 2,050 (26%) | |
| Never married | 1,368 (17%) | |
| 1 Mean (SD); n (%) | ||
Part 1: Model Selection for Linear Regression
max_mod <- lm(physhlth_days ~ menthlth_days + sex + exercise + depression + age_group + smoking + income + gen_health + education + marital, data = brfss_analytic)
tidy(max_mod, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI lower", "CI upper")
)|>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI lower | CI upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.6779 | 0.5148 | 3.2593 | 0.0011 | 0.6688 | 2.6871 |
| menthlth_days | 0.1839 | 0.0114 | 16.1977 | 0.0000 | 0.1616 | 0.2062 |
| sexFemale | 0.2350 | 0.1639 | 1.4336 | 0.1517 | -0.0863 | 0.5563 |
| exerciseYes | -1.9039 | 0.2031 | -9.3729 | 0.0000 | -2.3020 | -1.5057 |
| depressionYes | 0.7573 | 0.2149 | 3.5234 | 0.0004 | 0.3360 | 1.1787 |
| age_group35-49 | 0.7358 | 0.2827 | 2.6026 | 0.0093 | 0.1816 | 1.2901 |
| age_group50-64 | 1.3674 | 0.2778 | 4.9222 | 0.0000 | 0.8228 | 1.9120 |
| age_group65+ | 1.7255 | 0.2785 | 6.1951 | 0.0000 | 1.1795 | 2.2715 |
| smokingFormer | 0.4346 | 0.1878 | 2.3149 | 0.0206 | 0.0666 | 0.8027 |
| smokingCurrent | 0.4963 | 0.2653 | 1.8711 | 0.0614 | -0.0237 | 1.0164 |
| income$25K-$49K | -1.0835 | 0.2763 | -3.9217 | 0.0001 | -1.6251 | -0.5419 |
| income$50K-$99K | -1.5210 | 0.2761 | -5.5092 | 0.0000 | -2.0621 | -0.9798 |
| income$100K+ | -1.2931 | 0.3979 | -3.2495 | 0.0012 | -2.0731 | -0.5130 |
| gen_healthGood | 1.3654 | 0.1839 | 7.4264 | 0.0000 | 1.0050 | 1.7259 |
| gen_healthFair/Poor | 10.0022 | 0.2487 | 40.2182 | 0.0000 | 9.5147 | 10.4897 |
| educationHS/GED | 0.2889 | 0.4009 | 0.7206 | 0.4712 | -0.4970 | 1.0748 |
| educationSome college | 0.9892 | 0.4027 | 2.4566 | 0.0140 | 0.1998 | 1.7785 |
| educationCollege graduate | 1.0547 | 0.4045 | 2.6075 | 0.0091 | 0.2618 | 1.8475 |
| maritalPreviously married | -0.2536 | 0.2066 | -1.2273 | 0.2197 | -0.6586 | 0.1514 |
| maritalNever married | -0.4054 | 0.2490 | -1.6283 | 0.1035 | -0.8934 | 0.0826 |
# Reporting R^2, adjusted R^2, AIC,BIC
glance(max_mod) |>
dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
mutate(across(everything(), \(x) round(x, 3))) |>
kable(caption = "Maximum Model: Fit Statistics") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| r.squared | adj.r.squared | sigma | AIC | BIC | df.residual |
|---|---|---|---|---|---|
| 0.326 | 0.324 | 7.113 | 54115.19 | 54261.92 | 7980 |
This model explains about 32.6% of the variance in physically unhealthy days (\(R^2\) = 0.326, \(Adjusted R^2\) = 0.324). The AIC is 54,115.19 and BIC is 54,261.92, which provide baseline fit statistics for comparing alternative models.
best_subset <- regsubsets(physhlth_days ~ menthlth_days + sex + exercise + depression + age_group + smoking + income + gen_health + education + marital, data = brfss_analytic, nvmax = 19, method = "exhaustive")
best_subset_summary <- summary(best_subset)
subset_df <- tibble(
p = 1:length(best_subset_summary$adjr2),
`Adj. R²` = best_subset_summary$adjr2,
BIC = best_subset_summary$bic,
Cp = best_subset_summary$cp
)
#Ajust R-squared plot
plot1 <- ggplot(subset_df, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_subset_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
theme_minimal(base_size = 12)
#BIC plot
plot2 <- ggplot(subset_df, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_subset_summary$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
gridExtra::grid.arrange(plot1, plot2, ncol = 2)
cat("Best model by Adj. R²:", which.max(best_subset_summary$adjr2), "variables\n")
## Best model by Adj. R²: 18 variables
cat("Best model by BIC:", which.min(best_subset_summary$bic), "variables\n")
## Best model by BIC: 9 variables
The two criteria do not agree on the best model size. \(Adjusted R^2\) selected a 17-variable model because it continues to increase as more predictors are added. BIC selects a 10-variable model due to the stronger penalty for model complexity. BIC favors a simpler model.
#backward elimination
backward_mod <- step(max_mod, direction = "backward", trace =1)
## Start: AIC=31410.17
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + smoking + income + gen_health + education + marital
##
## Df Sum of Sq RSS AIC
## - marital 2 171 403887 31410
## <none> 403716 31410
## - sex 1 104 403820 31410
## - smoking 2 360 404077 31413
## - depression 1 628 404345 31421
## - education 3 876 404592 31422
## - income 3 1547 405263 31435
## - age_group 3 2242 405959 31448
## - exercise 1 4444 408161 31496
## - menthlth_days 1 13273 416990 31667
## - gen_health 2 85586 489303 32944
##
## Step: AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + smoking + income + gen_health + education
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
tidy(backward_mod, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.3853 | 0.4874 | 2.8425 | 0.0045 | 0.4300 | 2.3407 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| smokingFormer | 0.4399 | 0.1876 | 2.3442 | 0.0191 | 0.0720 | 0.8077 |
| smokingCurrent | 0.4776 | 0.2647 | 1.8038 | 0.0713 | -0.0414 | 0.9965 |
| income$25K-$49K | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50K-$99K | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100K+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
#Forward selection
null_mod <- lm(physhlth_days ~ 1, data = brfss_analytic)
forward_mod <- step(max_mod, scope = list(lower = null_mod, upper = max_mod), direction = "forward", trace = 1)
## Start: AIC=31410.17
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + smoking + income + gen_health + education + marital
tidy(forward_mod, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x,4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.6779 | 0.5148 | 3.2593 | 0.0011 | 0.6688 | 2.6871 |
| menthlth_days | 0.1839 | 0.0114 | 16.1977 | 0.0000 | 0.1616 | 0.2062 |
| sexFemale | 0.2350 | 0.1639 | 1.4336 | 0.1517 | -0.0863 | 0.5563 |
| exerciseYes | -1.9039 | 0.2031 | -9.3729 | 0.0000 | -2.3020 | -1.5057 |
| depressionYes | 0.7573 | 0.2149 | 3.5234 | 0.0004 | 0.3360 | 1.1787 |
| age_group35-49 | 0.7358 | 0.2827 | 2.6026 | 0.0093 | 0.1816 | 1.2901 |
| age_group50-64 | 1.3674 | 0.2778 | 4.9222 | 0.0000 | 0.8228 | 1.9120 |
| age_group65+ | 1.7255 | 0.2785 | 6.1951 | 0.0000 | 1.1795 | 2.2715 |
| smokingFormer | 0.4346 | 0.1878 | 2.3149 | 0.0206 | 0.0666 | 0.8027 |
| smokingCurrent | 0.4963 | 0.2653 | 1.8711 | 0.0614 | -0.0237 | 1.0164 |
| income$25K-$49K | -1.0835 | 0.2763 | -3.9217 | 0.0001 | -1.6251 | -0.5419 |
| income$50K-$99K | -1.5210 | 0.2761 | -5.5092 | 0.0000 | -2.0621 | -0.9798 |
| income$100K+ | -1.2931 | 0.3979 | -3.2495 | 0.0012 | -2.0731 | -0.5130 |
| gen_healthGood | 1.3654 | 0.1839 | 7.4264 | 0.0000 | 1.0050 | 1.7259 |
| gen_healthFair/Poor | 10.0022 | 0.2487 | 40.2182 | 0.0000 | 9.5147 | 10.4897 |
| educationHS/GED | 0.2889 | 0.4009 | 0.7206 | 0.4712 | -0.4970 | 1.0748 |
| educationSome college | 0.9892 | 0.4027 | 2.4566 | 0.0140 | 0.1998 | 1.7785 |
| educationCollege graduate | 1.0547 | 0.4045 | 2.6075 | 0.0091 | 0.2618 | 1.8475 |
| maritalPreviously married | -0.2536 | 0.2066 | -1.2273 | 0.2197 | -0.6586 | 0.1514 |
| maritalNever married | -0.4054 | 0.2490 | -1.6283 | 0.1035 | -0.8934 | 0.0826 |
#Stepwise selection
stepwise_mod <- step(null_mod, scope = list(lower = null_mod, upper= max_mod), direction = "both", trace = 1)
## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940 416725 31632
## + exercise 1 6467 428198 31849
## + depression 1 6070 428595 31857
## + income 3 3231 431434 31913
## + smoking 2 1776 432889 31938
## + age_group 3 1536 433129 31945
## + sex 1 1254 433411 31946
## + marital 2 830 433835 31956
## + education 3 416 434250 31965
## <none> 434665 31967
## - gen_health 2 164022 598687 34524
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5821 410904 31521
## + age_group 3 4661 412064 31548
## + income 3 1987 414738 31600
## + marital 2 1402 415322 31609
## + smoking 2 1035 415690 31616
## + depression 1 738 415987 31620
## + sex 1 605 416120 31622
## + education 3 319 416406 31632
## <none> 416725 31632
## - menthlth_days 1 17940 434665 31967
## - gen_health 2 121660 538384 33677
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720 407184 31455
## + income 3 1198 409706 31504
## + marital 2 1065 409839 31505
## + depression 1 739 410165 31509
## + smoking 2 706 410198 31512
## + education 3 687 410217 31514
## + sex 1 449 410455 31515
## <none> 410904 31521
## - exercise 1 5821 416725 31632
## - menthlth_days 1 17294 428198 31849
## - gen_health 2 101805 512709 33288
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1163 406021 31438
## + depression 1 933 406251 31438
## + education 3 502 406682 31451
## + sex 1 261 406923 31451
## + smoking 2 361 406823 31451
## <none> 407184 31455
## + marital 2 40 407144 31458
## - age_group 3 3720 410904 31521
## - exercise 1 4880 412064 31548
## - menthlth_days 1 19957 427141 31835
## - gen_health 2 94875 502059 33126
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 862 405159 31423
## + education 3 954 405067 31425
## + smoking 2 305 405716 31436
## + sex 1 203 405818 31436
## <none> 406021 31438
## + marital 2 154 405867 31439
## - income 3 1163 407184 31455
## - age_group 3 3685 409706 31504
## - exercise 1 4214 410235 31518
## - menthlth_days 1 18945 424966 31801
## - gen_health 2 87509 493530 32995
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837 404322 31412
## + smoking 2 259 404900 31422
## + sex 1 110 405049 31423
## <none> 405159 31423
## + marital 2 170 404990 31423
## - depression 1 862 406021 31438
## - income 3 1092 406251 31438
## - age_group 3 3871 409030 31493
## - exercise 1 4219 409378 31504
## - menthlth_days 1 13674 418834 31686
## - gen_health 2 86610 491770 32969
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332 403990 31410
## <none> 404322 31412
## + sex 1 78 404244 31413
## + marital 2 168 404154 31413
## - education 3 837 405159 31423
## - depression 1 745 405067 31425
## - income 3 1495 405818 31436
## - age_group 3 3558 407880 31476
## - exercise 1 4604 408926 31501
## - menthlth_days 1 13607 417929 31675
## - gen_health 2 86813 491135 32964
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 103 403887 31410
## <none> 403990 31410
## + marital 2 170 403820 31410
## - smoking 2 332 404322 31412
## - depression 1 691 404681 31421
## - education 3 910 404900 31422
## - income 3 1457 405447 31432
## - age_group 3 3178 407168 31466
## - exercise 1 4502 408492 31496
## - menthlth_days 1 13347 417337 31668
## - gen_health 2 85552 489542 32942
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## + marital 2 171 403716 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
tidy(stepwise_mod, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.3853 | 0.4874 | 2.8425 | 0.0045 | 0.4300 | 2.3407 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| income$25K-$49K | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50K-$99K | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100K+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
| smokingFormer | 0.4399 | 0.1876 | 2.3442 | 0.0191 | 0.0720 | 0.8077 |
| smokingCurrent | 0.4776 | 0.2647 | 1.8038 | 0.0713 | -0.0414 | 0.9965 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
The three AIC based methods did not all select the same final model. Backward elimination and stepwise selection agreed on a 17-variable model, while forward selection retains all predictors from the full model. Several predictors are retained by all methods including menthlth_days, exercise, depression, age_group, gen_health, smoking, education, and sex.
Step 4: Final Model Selection
I selected the stepwise AIC model as my final model. Both backward elimination and stepwise selection agreed on the same 17-predictor model, suggesting a stable solution.
tidy(stepwise_mod, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Final Model: Stepwise Selection (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.3853 | 0.4874 | 2.8425 | 0.0045 | 0.4300 | 2.3407 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| income$25K-$49K | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50K-$99K | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100K+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
| smokingFormer | 0.4399 | 0.1876 | 2.3442 | 0.0191 | 0.0720 | 0.8077 |
| smokingCurrent | 0.4776 | 0.2647 | 1.8038 | 0.0713 | -0.0414 | 0.9965 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
Holding all other variables constant, each additional mentally unhealthy day is associated with 0.18 more physically unhealthy days. Individuals reporting any exercise in the past 30 days had an average of 1.90 fewer physically unhealthy days compared to those who did not exercise. Adults reporting fair or poor general health experienced about 10 additional physically unhealthy days per month compared to those in excellent or very good health.
par(mfrow = c(2,2))
plot(stepwise_mod)
par(mfrow = c(1,1))
Residuals vs Fitted: The residuals show a curved pattern and the spread increases at higher fitted values, suggesting non-linearity and non-constant variance.
Q-Q Plot: The middle of the distribution follows the reference line reasonably well. The tails strongly deviate.
Scale-Location: The red line slopes upward and the residual spread increases with fitted values, illustrating a fan shape, confirming heteroscedasticity.
Residuals vs Leverage: No points exceed Cook’s distance.
The LINE assumptions are partially satisfied. The residuals plot showed non-linearity, heteroscedasticity, and non-normal tails, suggesting the model fit could improve.
Part 2: Logistic Regression
brfss_analytic <- brfss_analytic |>
mutate(frequent_distress = ifelse(physhlth_days >= 14, "Yes", "No"),
frequent_distress = factor(frequent_distress, levels = c("No", "Yes")))
tibble(Metric = c("Observations", "Variables", "Frequent Distress Cases", "Frequent Distress Prevalence"),
Value = c(nrow(brfss_analytic), ncol(brfss_analytic),
sum(brfss_analytic$frequent_distress == "Yes"),
paste0(round(100 * mean(brfss_analytic$frequent_distress == "Yes"), 1), "%"))) |>
kable(caption = "Frequent Distress Prevalence") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 24 |
| Frequent Distress Cases | 1052 |
| Frequent Distress Prevalence | 13.2% |
Frequent physical distress is relatively uncommon in this sample, with only 13.2% of adults meeting the threshold. This imbalance means the outcome is skewed toward the “No” category.
simple_mod <- glm(frequent_distress ~ gen_health, data = brfss_analytic, family = binomial)
tidy(simple_mod, conf.int = TRUE, exponentiate = FALSE) |>
kable(digits = 3, caption = "Simple Logistic Regression: Frequent Physical Distress ~ General Health Status (Log-Odds Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -3.398 | 0.090 | -37.672 | 0 | -3.580 | -3.226 |
| gen_healthGood | 1.142 | 0.112 | 10.197 | 0 | 0.924 | 1.364 |
| gen_healthFair/Poor | 3.289 | 0.105 | 31.428 | 0 | 3.087 | 3.498 |
exp(coef(simple_mod))
## (Intercept) gen_healthGood gen_healthFair/Poor
## 0.03342985 3.13235705 26.81066762
exp(confint(simple_mod))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.02787183 0.03970661
## gen_healthGood 2.52057996 3.91109619
## gen_healthFair/Poor 21.91471664 33.03776143
Odds Ratios Gen_health Good: OR = 3.13 95%CI: 2.52 - 3.91 Gen_health Fair/Poor: OR = 26.81 95% CI: 21.91 - 33.04
Compared to adults reporting excellent or very good general health, those reporting good health have 3.13 times the odds of experiencing frequent physical distress. Adults reporting fair or poor health have 26.81 times the odds of frequent physical distress. Both predictors are statistically significant since the confidence intervals exclude 1.
mod_logistic <- glm(
frequent_distress ~ gen_health + menthlth_days + exercise +
age_group + income + depression + education +
smoking + sex,
data = brfss_analytic,
family = binomial
)
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3, caption = "Multiple Logistic Regression") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.021 | 0.237 | -16.298 | 0.000 | 0.013 | 0.033 |
| gen_healthGood | 2.420 | 0.116 | 7.644 | 0.000 | 1.933 | 3.042 |
| gen_healthFair/Poor | 14.535 | 0.114 | 23.580 | 0.000 | 11.670 | 18.215 |
| menthlth_days | 1.046 | 0.004 | 10.298 | 0.000 | 1.037 | 1.055 |
| exerciseYes | 0.561 | 0.085 | -6.794 | 0.000 | 0.475 | 0.663 |
| age_group35-49 | 1.685 | 0.158 | 3.305 | 0.001 | 1.240 | 2.304 |
| age_group50-64 | 2.172 | 0.147 | 5.287 | 0.000 | 1.635 | 2.908 |
| age_group65+ | 2.664 | 0.144 | 6.800 | 0.000 | 2.017 | 3.551 |
| income$25K-$49K | 0.827 | 0.111 | -1.723 | 0.085 | 0.666 | 1.027 |
| income$50K-$99K | 0.645 | 0.111 | -3.946 | 0.000 | 0.519 | 0.802 |
| income$100K+ | 0.637 | 0.221 | -2.042 | 0.041 | 0.408 | 0.971 |
| depressionYes | 1.295 | 0.096 | 2.699 | 0.007 | 1.072 | 1.560 |
| educationHS/GED | 1.035 | 0.165 | 0.210 | 0.834 | 0.751 | 1.434 |
| educationSome college | 1.371 | 0.165 | 1.914 | 0.056 | 0.995 | 1.899 |
| educationCollege graduate | 1.306 | 0.171 | 1.561 | 0.119 | 0.937 | 1.832 |
| smokingFormer | 1.221 | 0.090 | 2.228 | 0.026 | 1.024 | 1.455 |
| smokingCurrent | 1.228 | 0.115 | 1.784 | 0.075 | 0.979 | 1.536 |
| sexFemale | 1.182 | 0.081 | 2.073 | 0.038 | 1.009 | 1.386 |
Holding all other variables constant, each additional mentally unhealthy day is associated with a 4.6% increase in the odds of frequent physical distress (adjusted odds ratio = 1.046, 95% CI: 1.037 - 1.055). Adults reporting fair or poor general health have 14.5 times the odds of frequent physical distress compared to those reporting excellent or very good health (adjusted odds ratio = 14.535, 95% CI:11.670 - 18.214). Adults who engaged in exercise in the past 30 days have 44% lower odds of frequent physical distress compared to those who did not exercise.
#reduced model (smoking removed)
reduced_mod <- glm(
frequent_distress ~ gen_health + menthlth_days + exercise +
age_group + depression + education +
smoking + sex,
data = brfss_analytic,
family = binomial
)
anova(reduced_mod, mod_logistic, test = "Chisq") |>
kable(digits = 3,
caption = "LR Test: Does adding smoker improve the model?") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7985 | 4440.376 | NA | NA | NA |
| 7982 | 4423.888 | 3 | 16.488 | 0.001 |
H0: The smoking variables do not improve the model fit. All smoking coefficient s= 0.
Ha: At least one smoking coefficient is not zero.
\(Chi^2\) = 16.488 Degrees of freedom = 3 p-value = 0.001
Since the p-value is 0.001, which is below 0.05, we reject the null hypothesis. Adding the smoking variable significantly improves the model.
roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)
auc(roc_obj)
## Area under the curve: 0.8555
ci.auc(roc_obj)
## 95% CI: 0.8427-0.8683 (DeLong)
The AUC value is 0.8555. This indicates excellent discrimination between individuals with and without frequent physical distress.
hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509
H0: The logistic regression model fits the data well, no evidence of poor fit
Ha: the model shows evidence of poor fit.
\(X^2\) = 6.8688 df = 8 p-value = 0.5509
With a p-value greater than 0.05, we fail to reject the null hypothesis. There is no evidence of poor model fit. The ROC/AUC of 0.8555 showed the model has excellent discrimination, meaning it can effectively distinguish individuals with and without frequent physical distress. The Hosmer-Lemeshow test adds that the model has good calibration, meaning the predicted probabilities match the observed outcomes well.
Part 3: Reflection The results of the regression model point out consistent factors associated with the burden of physical health among US adults. General health status emerged as the strongest predictor. Individuals reporting fair or poor health had substantially higher physically unhealthy days and more than 14 times the odds of frequent physical distress in the logistic model. Some predictors such as specific education and income categories were not statistically significant in the logistic model, despite showing small associations in the linear model. A key limitation of the analysis is the cross-sectional nature of BRFSS data, which prevents establishing causality. Unmeasured confounders such as chronic disease severity and access to health care could bias associations and were not included in the models. Comparing the two modeling approaches highlights how each provides distinct but complementary insights. The linear regression model quantifies how predictors relate to the number of physically unhealthy days, offering a continuous measure of burden. In contrast, the logistic regression model focuses on the probability of crossing a clinically meaningful threshold of frequent distress, which is often more interpretable for public health decision making. Linear regression emphasizes variance explained through r-squared, while logistic regression emphasizes discrimination and calibration through measures such as AUC. The two approaches provide a better understanding of the determinants of physical distress in the population. AI was used during the assignment to help understand coding errors and ways to resolve them based on the warnings. I would include the warning from the console and ask what it meant and/or ways to fix the problem.