#Research Question:
What individual, behavioral, and health factors are associated with physical health burden among U.S. adults? How do findings compare when the outcome is modeled as the number of 2 physically unhealthy days (continuous) versus as an indicator of frequent physical distress (binary: 14 or more days in the past 30)?
#Part 0: Data Preparation Step 1: Download Packages
library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)
library(janitor)
library(leaps)
Step 1: Import Data
brfss_2023 <- read_xpt(
"~/Documents/HEPI553/data/LLCP2023.XPT"
) |>
clean_names()
433,323 observations and 350 variables
Step 2: Recode All Variables
brfss_clean <- brfss_2023 |>
mutate(
# Mental health days
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77, 99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Physical unhealth days
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth %in% c(77, 99) ~ NA_real_,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# BMI
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ as.numeric(bmi5) / 100
),
# Sex
sex = factor(case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")),
# Exercise
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No","Yes")),
#Depression
depression = factor(case_when(
addepev3 == 1 ~ "Yes",
addepev3 == 2 ~ "No",
addepev3 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No","Yes")),
# Age group
age_group = factor(case_when(
ageg5yr %in% c(1,2,3) ~ "18-34",
ageg5yr %in% c(4,5,6) ~ "35-49",
ageg5yr %in% c(7,8,9) ~ "50-64",
ageg5yr %in% c(10,11,12,13) ~ "65+",
TRUE ~ NA_character_
),
levels = c("18-34","35-49","50-64","65+")),
# Income
income = factor(case_when(
incomg1 %in% c(1,2) ~ "Less than $25k",
incomg1 %in% c(3,4) ~ "$25-$49",
incomg1 %in% c(5,6) ~ "$50-$99",
incomg1 == 7 ~ "$100k+",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $25k", "$25-$49", "$50-$99", "$100k+")),
# Education
education = factor(case_when(
educa %in% c(1, 2,3) ~ "Less than HS",
educa == 4 ~ "HS/GED",
educa == 5 ~ "Some college",
educa == 6 ~ "College graduate",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than HS","HS/GED","Some college","College graduate")),
# Smoking
smoking = factor(case_when(
smoker3 %in% c(1, 2) ~ "Current smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_
),
levels = c("Current smoker","Former smoker","Never smoker")),
# General Health Status
gen_health = factor (case_when(
genhlth %in% c(1,2) ~ "Excellent/Very Good",
genhlth == 3 ~ "Good",
genhlth %in% c(4,5) ~ "Fair/Poor",
genhlth %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
#Marital Status
marital = factor(case_when(
marital %in% c(1,6) ~ "Married/Partnered",
marital %in% c(2, 3, 4) ~ "Previously married",
marital == 5 ~ "Never married",
marital == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Married/Partnered", "Previously married", "Never married")),
) |>
select(menthlth_days, physhlth_days, bmi, sex,depression, exercise, age_group, income, education, smoking, gen_health, marital)
Step 3: Analytic Dataset and Descriptive Summary
Report Missing Observations
cat("Missing values in analytic dataset:", sum(!complete.cases(brfss_clean)), "\n")
## Missing values in analytic dataset: 125095
cat("All models are fit on the same n =", nrow(brfss_clean), "observations.\n")
## All models are fit on the same n = 433323 observations.
cat("Missing menthlth_days:", sum(is.na(brfss_clean$menthlth_days)),
"(", round(mean(is.na(brfss_clean$menthlth_days)) * 100, 1), "%)\n")
## Missing menthlth_days: 8108 ( 1.9 %)
cat("Missing physhlth_days:", sum(is.na(brfss_clean$physhlth_days)),
"(", round(mean(is.na(brfss_clean$physhlth_days)) * 100, 1), "%)\n")
## Missing physhlth_days: 10785 ( 2.5 %)
cat("Missing income:", sum(is.na(brfss_clean$income)),
"(", round(mean(is.na(brfss_clean$income)) * 100, 1), "%)\n")
## Missing income: 86623 ( 20 %)
Drop Missing
set.seed(1220)
brfss_analytic_new <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
Reporting Analytic Sample
nrow(brfss_analytic_new)
## [1] 8000
Descriptive Table
brfss_analytic_new |>
select(menthlth_days, physhlth_days, age_group, sex, education, depression, exercise, smoking, bmi, income, gen_health, marital) |>
tbl_summary(
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
age_group ~ "Age (years)",
education ~ "Education level",
depression ~ "Ever told depressive disorder",
exercise ~ "Any physical activity (past 30 days)",
smoking ~ "Smoker Status",
bmi ~ "BMI Status",
income ~ "Income Status",
gen_health ~ "General health status",
marital ~ "Marital status",
sex ~ "Sex"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
bold_labels() |>
italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023**") |>
as_gt()
| Characteristic | N | N = 8,0001 |
|---|---|---|
| Mentally unhealthy days (past 30) | 8,000 | 4.3 (8.2) |
| Physically unhealthy days (past 30) | 8,000 | 4.3 (8.7) |
| Age (years) | 8,000 | |
| 18-34 | 1,294 (16%) | |
| 35-49 | 1,657 (21%) | |
| 50-64 | 2,109 (26%) | |
| 65+ | 2,940 (37%) | |
| Sex | 8,000 | |
| Male | 3,971 (50%) | |
| Female | 4,029 (50%) | |
| Education level | 8,000 | |
| Less than HS | 391 (4.9%) | |
| HS/GED | 1,887 (24%) | |
| Some college | 2,115 (26%) | |
| College graduate | 3,607 (45%) | |
| Ever told depressive disorder | 8,000 | 1,776 (22%) |
| Any physical activity (past 30 days) | 8,000 | 6,157 (77%) |
| Smoker Status | 8,000 | |
| Current smoker | 962 (12%) | |
| Former smoker | 2,230 (28%) | |
| Never smoker | 4,808 (60%) | |
| BMI Status | 8,000 | 28.7 (6.5) |
| Income Status | 8,000 | |
| Less than $25k | 1,090 (14%) | |
| $25-$49 | 1,931 (24%) | |
| $50-$99 | 4,297 (54%) | |
| $100k+ | 682 (8.5%) | |
| General health status | 8,000 | |
| Excellent/Very Good | 3,926 (49%) | |
| Good | 2,648 (33%) | |
| Fair/Poor | 1,426 (18%) | |
| Marital status | 8,000 | |
| Married/Partnered | 4,582 (57%) | |
| Previously married | 2,050 (26%) | |
| Never married | 1,368 (17%) | |
| 1 Mean (SD); n (%) | ||
Part 1: Model Selection for Linear Regression
Step 1: Fit the Maximum Model
Linear Regression Model
model_maxA <- lm(physhlth_days ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
data = brfss_analytic_new)
summary(model_maxA)
##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + age_group + sex +
## education + depression + exercise + smoking + bmi + income +
## gen_health + marital, data = brfss_analytic_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.795 -2.839 -1.145 0.690 31.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.66546 0.64506 4.132 3.63e-05 ***
## menthlth_days 0.18362 0.01135 16.172 < 2e-16 ***
## age_group35-49 0.76935 0.28365 2.712 0.006696 **
## age_group50-64 1.40108 0.27874 5.026 5.11e-07 ***
## age_group65+ 1.72725 0.27852 6.202 5.87e-10 ***
## sexFemale 0.23449 0.16389 1.431 0.152538
## educationHS/GED 0.29371 0.40089 0.733 0.463806
## educationSome college 1.00198 0.40273 2.488 0.012867 *
## educationCollege graduate 1.05262 0.40445 2.603 0.009269 **
## depressionYes 0.77536 0.21528 3.602 0.000318 ***
## exerciseYes -1.93289 0.20408 -9.471 < 2e-16 ***
## smokingFormer smoker -0.02637 0.28445 -0.093 0.926150
## smokingNever smoker -0.46414 0.26617 -1.744 0.081241 .
## bmi -0.01877 0.01287 -1.459 0.144638
## income$25-$49 -1.08440 0.27627 -3.925 8.74e-05 ***
## income$50-$99 -1.52385 0.27607 -5.520 3.50e-08 ***
## income$100k+ -1.31580 0.39821 -3.304 0.000956 ***
## gen_healthGood 1.40982 0.18635 7.565 4.30e-14 ***
## gen_healthFair/Poor 10.06509 0.25239 39.879 < 2e-16 ***
## maritalPreviously married -0.26701 0.20679 -1.291 0.196680
## maritalNever married -0.41244 0.24899 -1.656 0.097666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared: 0.3258, Adjusted R-squared: 0.3242
## F-statistic: 192.8 on 20 and 7979 DF, p-value: < 2.2e-16
glance(model_maxA) |>
dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
mutate(across(everything(), \(x) round(x, 3))) |>
kable(caption = "Maximum Model: Fit Statistics") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| r.squared | adj.r.squared | sigma | AIC | BIC | df.residual |
|---|---|---|---|---|---|
| 0.326 | 0.324 | 7.112 | 54115.05 | 54268.77 | 7979 |
Step 2: Best Subsets Regression
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
data = brfss_analytic_new,
nvmax = 12, # maximum number of variables to consider
method = "exhaustive"
)
best_summary <- summary(best_subsets)
subset_metrics <- tibble(
p = 1:length(best_summary$adjr2),
`Adj. R²` = best_summary$adjr2,
BIC = best_summary$bic,
Cp = best_summary$cp
)
ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
theme_minimal(base_size = 12)
ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_summary$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
In 2-3 sentences, describe what you observe: Do the two criteria agree
on the best model size? If not, which criterion favors a simpler
model?
The best subset confirms the analysis by using both AIC and BIC models. The adjusted R^2 reaches it maximum at 12 variables and plateaus, while BIC selects a more parsimonious model with 8.6 variables. As shown, the two criteria agree on the best model size since each show the correlation between the outcome and the variables used.
Step 3: Stepwise Selection Methods
Backward elimination
mod_backward <- step(model_maxA, direction = "backward", trace = 1)
## Start: AIC=31410.04
## physhlth_days ~ menthlth_days + age_group + sex + education +
## depression + exercise + smoking + bmi + income + gen_health +
## marital
##
## Df Sum of Sq RSS AIC
## - marital 2 180 403789 31410
## <none> 403609 31410
## - sex 1 104 403712 31410
## - bmi 1 108 403716 31410
## - smoking 2 347 403956 31413
## - depression 1 656 404265 31421
## - education 3 876 404485 31421
## - income 3 1550 405158 31435
## - age_group 3 2235 405844 31448
## - exercise 1 4537 408146 31497
## - menthlth_days 1 13230 416839 31666
## - gen_health 2 84670 488279 32930
##
## Step: AIC=31409.61
## physhlth_days ~ menthlth_days + age_group + sex + education +
## depression + exercise + smoking + bmi + income + gen_health
##
## Df Sum of Sq RSS AIC
## - bmi 1 98 403887 31410
## <none> 403789 31410
## - sex 1 102 403891 31410
## - smoking 2 345 404134 31412
## - depression 1 637 404427 31420
## - education 3 875 404665 31421
## - income 3 1388 405177 31431
## - age_group 3 3048 406837 31464
## - exercise 1 4538 408327 31497
## - menthlth_days 1 13195 416984 31665
## - gen_health 2 84707 488496 32929
##
## Step: AIC=31409.55
## physhlth_days ~ menthlth_days + age_group + sex + education +
## depression + exercise + smoking + income + gen_health
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
tidy(mod_backward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.8629 | 0.5225 | 3.5653 | 0.0004 | 0.8387 | 2.8871 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| smokingFormer smoker | -0.0377 | 0.2831 | -0.1332 | 0.8940 | -0.5926 | 0.5172 |
| smokingNever smoker | -0.4776 | 0.2647 | -1.8038 | 0.0713 | -0.9965 | 0.0414 |
| income$25-$49 | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50-$99 | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100k+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
Forward selection
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic_new)
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = model_maxA),
direction = "forward", trace = 1)
## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940.1 416725 31632
## + exercise 1 6466.7 428198 31849
## + depression 1 6070.1 428595 31857
## + income 3 3231.0 431434 31913
## + smoking 2 1776.3 432889 31938
## + age_group 3 1535.7 433129 31945
## + sex 1 1254.2 433411 31946
## + marital 2 830.3 433835 31956
## + education 3 415.5 434250 31965
## <none> 434665 31967
## + bmi 1 0.2 434665 31969
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5820.8 410904 31521
## + age_group 3 4661.5 412064 31548
## + income 3 1987.0 414738 31600
## + marital 2 1402.5 415322 31609
## + smoking 2 1034.6 415690 31616
## + depression 1 738.1 415987 31620
## + sex 1 605.2 416120 31622
## + education 3 318.9 416406 31632
## <none> 416725 31632
## + bmi 1 3.6 416721 31634
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720.5 407184 31455
## + income 3 1198.4 409706 31504
## + marital 2 1064.8 409839 31505
## + depression 1 739.1 410165 31509
## + smoking 2 706.2 410198 31512
## + education 3 686.9 410217 31514
## + sex 1 449.3 410455 31515
## <none> 410904 31521
## + bmi 1 82.9 410821 31522
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1162.69 406021 31438
## + depression 1 932.70 406251 31438
## + education 3 501.69 406682 31451
## + sex 1 261.03 406923 31452
## + smoking 2 360.51 406823 31452
## <none> 407184 31455
## + bmi 1 83.59 407100 31455
## + marital 2 40.11 407144 31458
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 861.57 405159 31423
## + education 3 953.62 405067 31425
## + smoking 2 305.02 405716 31436
## + sex 1 203.01 405818 31436
## <none> 406021 31438
## + bmi 1 74.91 405946 31438
## + marital 2 153.70 405867 31439
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837.21 404322 31412
## + smoking 2 259.11 404900 31422
## + sex 1 110.16 405049 31422
## + bmi 1 105.12 405054 31423
## <none> 405159 31423
## + marital 2 169.72 404990 31423
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332.18 403990 31410
## + bmi 1 110.19 404212 31412
## <none> 404322 31412
## + sex 1 77.87 404244 31413
## + marital 2 168.26 404154 31413
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 102.92 403887 31410
## <none> 403990 31410
## + bmi 1 98.80 403891 31410
## + marital 2 169.54 403820 31410
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## + bmi 1 97.861 403789 31410
## + marital 2 170.601 403716 31410
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.8629 | 0.5225 | 3.5653 | 0.0004 | 0.8387 | 2.8871 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| income$25-$49 | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50-$99 | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100k+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
| smokingFormer smoker | -0.0377 | 0.2831 | -0.1332 | 0.8940 | -0.5926 | 0.5172 |
| smokingNever smoker | -0.4776 | 0.2647 | -1.8038 | 0.0713 | -0.9965 | 0.0414 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
Stepwise selection
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = model_maxA),
direction = "both", trace = 1)
## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940 416725 31632
## + exercise 1 6467 428198 31849
## + depression 1 6070 428595 31857
## + income 3 3231 431434 31913
## + smoking 2 1776 432889 31938
## + age_group 3 1536 433129 31945
## + sex 1 1254 433411 31946
## + marital 2 830 433835 31956
## + education 3 416 434250 31965
## <none> 434665 31967
## + bmi 1 0 434665 31969
## - gen_health 2 164022 598687 34524
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5821 410904 31521
## + age_group 3 4661 412064 31548
## + income 3 1987 414738 31600
## + marital 2 1402 415322 31609
## + smoking 2 1035 415690 31616
## + depression 1 738 415987 31620
## + sex 1 605 416120 31622
## + education 3 319 416406 31632
## <none> 416725 31632
## + bmi 1 4 416721 31634
## - menthlth_days 1 17940 434665 31967
## - gen_health 2 121660 538384 33677
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720 407184 31455
## + income 3 1198 409706 31504
## + marital 2 1065 409839 31505
## + depression 1 739 410165 31509
## + smoking 2 706 410198 31512
## + education 3 687 410217 31514
## + sex 1 449 410455 31515
## <none> 410904 31521
## + bmi 1 83 410821 31522
## - exercise 1 5821 416725 31632
## - menthlth_days 1 17294 428198 31849
## - gen_health 2 101805 512709 33288
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1163 406021 31438
## + depression 1 933 406251 31438
## + education 3 502 406682 31451
## + sex 1 261 406923 31451
## + smoking 2 361 406823 31451
## <none> 407184 31455
## + bmi 1 84 407100 31455
## + marital 2 40 407144 31458
## - age_group 3 3720 410904 31521
## - exercise 1 4880 412064 31548
## - menthlth_days 1 19957 427141 31835
## - gen_health 2 94875 502059 33126
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 862 405159 31423
## + education 3 954 405067 31425
## + smoking 2 305 405716 31436
## + sex 1 203 405818 31436
## <none> 406021 31438
## + bmi 1 75 405946 31438
## + marital 2 154 405867 31439
## - income 3 1163 407184 31455
## - age_group 3 3685 409706 31504
## - exercise 1 4214 410235 31518
## - menthlth_days 1 18945 424966 31801
## - gen_health 2 87509 493530 32995
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837 404322 31412
## + smoking 2 259 404900 31422
## + sex 1 110 405049 31423
## + bmi 1 105 405054 31423
## <none> 405159 31423
## + marital 2 170 404990 31423
## - depression 1 862 406021 31438
## - income 3 1092 406251 31438
## - age_group 3 3871 409030 31493
## - exercise 1 4219 409378 31504
## - menthlth_days 1 13674 418834 31686
## - gen_health 2 86610 491770 32969
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332 403990 31410
## + bmi 1 110 404212 31412
## <none> 404322 31412
## + sex 1 78 404244 31413
## + marital 2 168 404154 31413
## - education 3 837 405159 31423
## - depression 1 745 405067 31425
## - income 3 1495 405818 31436
## - age_group 3 3558 407880 31476
## - exercise 1 4604 408926 31501
## - menthlth_days 1 13607 417929 31675
## - gen_health 2 86813 491135 32964
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 103 403887 31410
## <none> 403990 31410
## + bmi 1 99 403891 31410
## + marital 2 170 403820 31410
## - smoking 2 332 404322 31412
## - depression 1 691 404681 31421
## - education 3 910 404900 31422
## - income 3 1457 405447 31432
## - age_group 3 3178 407168 31466
## - exercise 1 4502 408492 31496
## - menthlth_days 1 13347 417337 31668
## - gen_health 2 85552 489542 32942
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## + bmi 1 98 403789 31410
## + marital 2 171 403716 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
method_comparison <- tribble(
~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
"Maximum model",
length(coef(model_maxA)) - 1,
round(glance(model_maxA)$adj.r.squared, 4),
round(AIC(model_maxA), 1),
round(BIC(model_maxA), 1),
"Backward (AIC)",
length(coef(mod_backward)) - 1,
round(glance(mod_backward)$adj.r.squared, 4),
round(AIC(mod_backward), 1),
round(BIC(mod_backward), 1),
"Forward (AIC)",
length(coef(mod_forward)) - 1,
round(glance(mod_forward)$adj.r.squared, 4),
round(AIC(mod_forward), 1),
round(BIC(mod_forward), 1),
"Stepwise (AIC)",
length(coef(mod_stepwise)) - 1,
round(glance(mod_stepwise)$adj.r.squared, 4),
round(AIC(mod_stepwise), 1),
round(BIC(mod_stepwise), 1)
)
method_comparison |>
kable(caption = "Comparison of Variable Selection Methods") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Method | Variables selected | Adj. R² | AIC | BIC |
|---|---|---|---|---|
| Maximum model | 20 | 0.3242 | 54115.1 | 54268.8 |
| Backward (AIC) | 17 | 0.3239 | 54114.6 | 54247.3 |
| Forward (AIC) | 17 | 0.3239 | 54114.6 | 54247.3 |
| Stepwise (AIC) | 17 | 0.3239 | 54114.6 | 54247.3 |
In 3-5 sentences, compare the results: • Do the three methods agree on the same final model? • Which variables (if any) were excluded by all three methods? • Which variables were retained by all three methods
When using the backward elimination, forward and stepwise, all three methods selected the same model with 17 predictor terms (Adjusted R^2 = 0.024, AIC = 54114.6, BIC = 54247.3). Based on the model, each excluded specific variables but not on all three methods. Variables such as smoking, exercise and mentally unhealthy days were retained by all three methods.
Step 4: Final Model Selection and Interpretation
Choose your final model. State which method(s) guided your choice and why. Display the coefficient table for your final model using tidy() with confidence intervals
My final model is the maximum model in order to see the outcome of physically unhealthy days and its correlation to the predictors used.
mod_max <- lm(physhlth_days ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
data = brfss_analytic_new)
tidy(mod_max, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 2.6655 | 0.6451 | 4.1321 | 0.0000 | 1.4010 | 3.9300 |
| menthlth_days | 0.1836 | 0.0114 | 16.1723 | 0.0000 | 0.1614 | 0.2059 |
| age_group35-49 | 0.7693 | 0.2836 | 2.7123 | 0.0067 | 0.2133 | 1.3254 |
| age_group50-64 | 1.4011 | 0.2787 | 5.0264 | 0.0000 | 0.8547 | 1.9475 |
| age_group65+ | 1.7272 | 0.2785 | 6.2016 | 0.0000 | 1.1813 | 2.2732 |
| sexFemale | 0.2345 | 0.1639 | 1.4308 | 0.1525 | -0.0868 | 0.5558 |
| educationHS/GED | 0.2937 | 0.4009 | 0.7326 | 0.4638 | -0.4921 | 1.0796 |
| educationSome college | 1.0020 | 0.4027 | 2.4880 | 0.0129 | 0.2125 | 1.7914 |
| educationCollege graduate | 1.0526 | 0.4044 | 2.6026 | 0.0093 | 0.2598 | 1.8454 |
| depressionYes | 0.7754 | 0.2153 | 3.6016 | 0.0003 | 0.3533 | 1.1974 |
| exerciseYes | -1.9329 | 0.2041 | -9.4711 | 0.0000 | -2.3329 | -1.5328 |
| smokingFormer smoker | -0.0264 | 0.2845 | -0.0927 | 0.9261 | -0.5840 | 0.5312 |
| smokingNever smoker | -0.4641 | 0.2662 | -1.7438 | 0.0812 | -0.9859 | 0.0576 |
| bmi | -0.0188 | 0.0129 | -1.4589 | 0.1446 | -0.0440 | 0.0065 |
| income$25-$49 | -1.0844 | 0.2763 | -3.9251 | 0.0001 | -1.6260 | -0.5428 |
| income$50-$99 | -1.5238 | 0.2761 | -5.5199 | 0.0000 | -2.0650 | -0.9827 |
| income$100k+ | -1.3158 | 0.3982 | -3.3043 | 0.0010 | -2.0964 | -0.5352 |
| gen_healthGood | 1.4098 | 0.1863 | 7.5655 | 0.0000 | 1.0445 | 1.7751 |
| gen_healthFair/Poor | 10.0651 | 0.2524 | 39.8790 | 0.0000 | 9.5703 | 10.5598 |
| maritalPreviously married | -0.2670 | 0.2068 | -1.2912 | 0.1967 | -0.6724 | 0.1384 |
| maritalNever married | -0.4124 | 0.2490 | -1.6565 | 0.0977 | -0.9005 | 0.0756 |
Interpret the coefficients for at least three predictors in plain language, including units and “holding all other variables constant” language. Include at least one continuous and one categorical predictor.
Menthlth_days(continuous predictor) = each additional day of mentally unhealthy days is associated with an estimated 0.184 additional physically unhealthy days on average, holding age, sex, education, depression, smoking, BMI, income, gen health status and marital status constant.
Depression (continuous predictor) = each one-unit increase in the depression is associated with 0.775 fewer physically unhealthy days on average, holding all other variables constant.
Sex (categorical predictor) = Female vs Male = compared to males (reference group), females report an estimated 0.235 more physically unhealthy days on average, holding all other variables constant.
Step 5: LINE Assumption Check
par(mfrow = c(2, 2))
plot(model_maxA)
par(mfrow = c(1, 1))
The plot shows there is evidence of non-linearity variance since the plots show a random scatter around 0 and there is no specific pattern.
The residuals do follow a normal distribution until the value is 2, then it starts to deviate in an increase direction.
Yes, the spread of residuals roughly constant across the fitted values. As shows, many of the dots follow the flat red line and is a constant spread.
Yes, there are highly influential observations as the x-axis increases. There are observations at 0.008, 0.010 and 0.014.
Write a brief conclusion (2-3 sentences): Are the LINE assumptions reasonably satisfied? What violations, if any, do you observe?
Given the line assumptions represented above, they are all reasonably stratified. Possibly violations that could occur include normality depending on the size of the sample, which can lead to biases and homoscedasticity.
#Part 2: Logistic Regression
You will now reframe the research question using a binary outcome. The CDC defines frequent physical distress as reporting 14 or more physically unhealthy days in the past 30 days.
Step 1: Create the Binary Outcome
brfss_analytic_new <- brfss_analytic_new %>%
mutate(
frequent_distress = case_when(
physhlth_days >= 14 ~ "Yes",
physhlth_days < 14 ~ "No",
TRUE ~ NA_character_
),
frequent_distress = factor(frequent_distress,
levels = c ("No", "Yes"))
)
tab <- table(brfss_analytic_new$frequent_distress)
freq_table <- data.frame(
"Frequent Distress" = names(tab),
N = as.vector(tab),
Percent = paste0(round(100 * prop.table(tab), 1), "%")
)
knitr::kable(
freq_table,
col.names = c("Frequent Distress", "N", "%"),
caption = "Outcome Distribution")
| Frequent Distress | N | % |
|---|---|---|
| No | 6948 | 86.9% |
| Yes | 1052 | 13.2% |
In 1-2 sentences, comment on the balance of the outcome
When looking at frequent distress, the individuals who reported not being distressed were significantly higher compared to the individuals who did report distress. This means that the individuals who did not experience distress had few physically unhealthy days.
Step 2: Simple (Unadjusted) Logistic Regression
Choose one predictor from your final linear model that you expect to have a strong association with physical distress. State why you chose it
I chose the predictor BMI from my final linear model because the higher an individuals BMI is, the higher the chance of them experiencing physical distress. This can be due to physical health barriers and emotional challenges.
mod_simple <- glm(frequent_distress ~ bmi,
data = brfss_analytic_new, family = binomial)
summary(mod_simple)
##
## Call:
## glm(formula = frequent_distress ~ bmi, family = binomial, data = brfss_analytic_new)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.993253 0.142112 -21.063 < 2e-16 ***
## bmi 0.037759 0.004624 8.165 3.21e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 6164.3 on 7998 degrees of freedom
## AIC: 6168.3
##
## Number of Fisher Scoring iterations: 4
mod_simple <- glm(frequent_distress ~ bmi, data = brfss_analytic_new,
family = binomial(link = "logit"))
tidy(mod_simple, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: Frequent Distress ~ BMI (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.050 | 0.142 | -21.063 | 0 | 0.038 | 0.066 |
| bmi | 1.038 | 0.005 | 8.165 | 0 | 1.029 | 1.048 |
exp(coef(mod_simple)) # Odds ratio
## (Intercept) bmi
## 0.0501241 1.0384807
exp(confint(mod_simple)) # 95% CI for OR
## 2.5 % 97.5 %
## (Intercept) 0.03792694 0.06621861
## bmi 1.02906666 1.04790124
Interpret the odds ratio in plain language.
For every one-unit increase in BMI, the odds of frequent physical distress are multiplied by 1.038, 95% CI [1.029, 1.048].
State whether the association is statistically significant at alpha = 0.05 and how you know (Wald test p-value or CI excluding 1.0)
Since the p-value = 0, this means it is less than 0.05. Therefore, it is statistically significant and there is a positive correlation between BMI and physical distress.
Step 3: Multiple Logistic Regression
mod_logistic <- glm(frequent_distress ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + income + gen_health + marital,
data = brfss_analytic_new, family = binomial)
summary(mod_logistic)
##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + age_group +
## sex + education + depression + exercise + smoking + bmi +
## income + gen_health + marital, family = binomial, data = brfss_analytic_new)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.431364 0.297352 -11.540 < 2e-16 ***
## menthlth_days 0.044787 0.004358 10.277 < 2e-16 ***
## age_group35-49 0.512672 0.162401 3.157 0.00160 **
## age_group50-64 0.767269 0.154816 4.956 7.20e-07 ***
## age_group65+ 0.964882 0.155220 6.216 5.09e-10 ***
## sexFemale 0.176389 0.081382 2.167 0.03020 *
## educationHS/GED 0.049334 0.165069 0.299 0.76504
## educationSome college 0.331979 0.165058 2.011 0.04430 *
## educationCollege graduate 0.283640 0.171335 1.655 0.09783 .
## depressionYes 0.267888 0.095954 2.792 0.00524 **
## exerciseYes -0.586264 0.085768 -6.835 8.17e-12 ***
## smokingFormer smoker -0.007660 0.121363 -0.063 0.94967
## smokingNever smoker -0.208048 0.116374 -1.788 0.07381 .
## bmi -0.004822 0.005556 -0.868 0.38546
## income$25-$49 -0.215375 0.111712 -1.928 0.05386 .
## income$50-$99 -0.501752 0.118036 -4.251 2.13e-05 ***
## income$100k+ -0.541456 0.227454 -2.381 0.01729 *
## gen_healthGood 0.895044 0.116270 7.698 1.38e-14 ***
## gen_healthFair/Poor 2.696530 0.115309 23.385 < 2e-16 ***
## maritalPreviously married -0.138611 0.096205 -1.441 0.14965
## maritalNever married -0.143542 0.127789 -1.123 0.26132
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4420.7 on 7979 degrees of freedom
## AIC: 4462.7
##
## Number of Fisher Scoring iterations: 6
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.032 | 0.297 | -11.540 | 0.000 | 0.018 | 0.058 |
| menthlth_days | 1.046 | 0.004 | 10.277 | 0.000 | 1.037 | 1.055 |
| age_group35-49 | 1.670 | 0.162 | 3.157 | 0.002 | 1.217 | 2.302 |
| age_group50-64 | 2.154 | 0.155 | 4.956 | 0.000 | 1.596 | 2.929 |
| age_group65+ | 2.624 | 0.155 | 6.216 | 0.000 | 1.944 | 3.573 |
| sexFemale | 1.193 | 0.081 | 2.167 | 0.030 | 1.017 | 1.400 |
| educationHS/GED | 1.051 | 0.165 | 0.299 | 0.765 | 0.762 | 1.456 |
| educationSome college | 1.394 | 0.165 | 2.011 | 0.044 | 1.011 | 1.932 |
| educationCollege graduate | 1.328 | 0.171 | 1.655 | 0.098 | 0.952 | 1.864 |
| depressionYes | 1.307 | 0.096 | 2.792 | 0.005 | 1.082 | 1.576 |
| exerciseYes | 0.556 | 0.086 | -6.835 | 0.000 | 0.470 | 0.658 |
| smokingFormer smoker | 0.992 | 0.121 | -0.063 | 0.950 | 0.783 | 1.260 |
| smokingNever smoker | 0.812 | 0.116 | -1.788 | 0.074 | 0.647 | 1.022 |
| bmi | 0.995 | 0.006 | -0.868 | 0.385 | 0.984 | 1.006 |
| income$25-$49 | 0.806 | 0.112 | -1.928 | 0.054 | 0.648 | 1.004 |
| income$50-$99 | 0.605 | 0.118 | -4.251 | 0.000 | 0.481 | 0.763 |
| income$100k+ | 0.582 | 0.227 | -2.381 | 0.017 | 0.368 | 0.899 |
| gen_healthGood | 2.447 | 0.116 | 7.698 | 0.000 | 1.952 | 3.081 |
| gen_healthFair/Poor | 14.828 | 0.115 | 23.385 | 0.000 | 11.863 | 18.648 |
| maritalPreviously married | 0.871 | 0.096 | -1.441 | 0.150 | 0.720 | 1.050 |
| maritalNever married | 0.866 | 0.128 | -1.123 | 0.261 | 0.673 | 1.111 |
Interpret the adjusted odds ratios for at least three predictors in plain language, using “holding all other variables constant” language. Include at least one continuous and one categorical predictor.
EducationHS/GED (continuous predictor) = each additional day of educationHS/GED days is associated with an estimated 1.051 additional physically unhealthy days on average, holding age, sex, mentally unhealthy days,depression, smoking, BMI, income, gen health status and marital status constant.
Exercise(continuous predictor) = each one-unit increase in the exercise is associated with 0.556 fewer physically unhealthy days on average, holding all other variables constant.
Sex (categorical predictor) = Female vs Male = compared to males (reference group), females report an estimated 1.193 more physically unhealthy days on average, holding all other variables constant.
Step 4: Likelihood Ratio Test
Group: All education levels
mod_reduced_education <- glm(frequent_distress ~ menthlth_days + age_group + sex + education + depression + exercise + smoking + bmi + gen_health + marital, data =
brfss_analytic_new, family = binomial)
# Likelihood ratio test
lr_test <- anova(mod_reduced_education, mod_logistic, test = "Chisq")
lr_test |>
kable(digits = 3, caption = "Likelihood Ratio Test: Full Model vs. Null") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7982 | 4439.742 | NA | NA | NA |
| 7979 | 4420.659 | 3 | 19.082 | 0 |
• State the null and alternative hypotheses
H0: Group of predictors is associated with physical unhealthy days H1: Group of predictors is not associated with physical unhealthy days
• Report: test statistic (deviance) = 19.082 degrees of freedom = 3 p-value = 0
• Conclude at alpha = 0.05: Does this group of predictors significantly improve the model?
Since the p-value is less than 0.05, this group of predictors are significantly improve the model for the outcome of physically unhealthy days.
Step 5: ROC Curve and AUC
roc_obj <- roc(brfss_analytic_new$frequent_distress, fitted(mod_logistic))
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)
auc(roc_obj)
## Area under the curve: 0.8555
ci.auc(roc_obj)
## 95% CI: 0.8426-0.8683 (DeLong)
Interpret the AUC: What does this value tell you about the model’s ability to discriminate between individuals with and without frequent physical distress? Use the following guide:
o 0.5 = no discrimination (coin flip) o 0.5-0.7 = poor discrimination o 0.7-0.8 = acceptable discrimination o 0.8-0.9 = excellent discrimination o 0.9+ = outstanding discrimination
The model tells me that since the AUC = 0.8555, there is excellent discrimination between individuals with and without frequent physical distress.
Step 6: Hosmer-Lemeshow Goodness-of-Fit Test
hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945
• State the null hypothesis (the model fits the data well) and alternative hypothesis (the model does not fit the data well)
H0: The model fits the data for physically unhealthy days H1: The model does not fit the data for physically unhealthy days
• Report: test statistic = 5.5773 degrees of freedom = 8 p-value = 0.6945
• Interpret: At alpha = 0.05, is there evidence of poor model fit?
Since the p-value is greater than 0.05, the model does fit in some regions of predicted probability.
• In 1-2 sentences, discuss how the Hosmer-Lemeshow result complements the ROC/AUC finding.
The Hosmer-Lemeshow result complements the ROC/AUC finding by assessing the agreement between the predicted and observed rates within the predicted probability. With this data, it can also help determine the discrimination and the curve of the plot.
Part 3: Reflection
A. Statistical Insights (5 points): What do your results suggest about the factors associated with physical health burden among U.S. adults? Which predictors were the strongest in both the linear and logistic models? Were any predictors significant in one model but not the other? What are the key limitations of using cross-sectional survey data for this type of analysis? Name at least two specific potential confounders not measured in your model.
I would suggest that there are certain factors that contribute to the physical health burden among U.S. adults. For example, mentally unhealthy days, individuals who are 35-49 years old and people who have a poor general health status are at a higher chance of developing physically unhealthy days. The strongest predictors in the linear models was BMI, mentally unhealthy days, individuals ages 35-65+. The predictors remained significant in the models, but it varied on the p-value. Limitations using a cross-sectional survey for this type of analysis is that it might be more challenging to identify the temporal relationship based on the physical unhealthy days when considering other factors such as environmental contributors. Biases can also be a limitation depending on the sample size. One potential confounder not measured in the model is alcohol consumption. This can have an impact on depression, exercise, BMI, mentally unhealthy days and general health status. Another potential confounder could be ethnicity. Ethnicity can play a crucial role in the development of physically unhealthy days. Based on anindividual’s ethnicity, this can lead to factors such as lack of resources and services available to achieve healthy physical days.
B. Linear versus Logistic Regression (5 points): Compare what the linear regression model tells you versus what the logistic regression model tells you about the same research question. What information does each approach provide that the other does not? In what situations would you prefer one approach over the other? How do the model selection criteria differ between the two frameworks (for example, R-squared versus AUC)?
When comparing the linear regression model from the logistic regression model, the linear model is able to analyze and identify the significance of one predictor while the logistic regression takes into account multiple factors for the outcome of physically unhealthy days among U.S. adults. In the models, we see that linear shows how continuous predictors have an impact on the outcome while logistic uses binary outcomes. The linear regression allows us to predict the values of physically unhealthy days while also testing the hypotheses. The logistic regression allows us to analyze the adjusted odds ratios in order to control for potential confounders. I would prefer the logistic regression if I am using certain predictors and outcome variables that are both categorical. I would be able to determine the probability of an event occurring based on the correlation between predictors. The model selection criteria differ between two frameworks because each has its own methods, limitations and violations based on the model used. When using R-squared, information will be coded and analyzed differently compared to when using the AUC plot.
C. AI Transparency (5 points): Describe any parts of this assignment where you sought AI assistance. Include the specific prompts you used and how you verified the correctness of the output. If you did not use AI, state this explicitly.
I had used AI when it came to minor grammatical errors such as using too many commas, brackets or leaving certain syntax out. For creating the binary outcome, I had a hard time trying to initially write the code. I had referenced the lecture material but AI recommended how I should format the beginning. It also recommended to use names(tab) instead of levels. I used CHAT in multiple logistic regression because I was having problems why frequent distress variable was not in my data set. CHAT recommended I had to go back a few steps to recheck my codes. For the likelihood ratio test, I had trouble writing my mod_reduced section, I used reference from the lecture materials but I needed to change the formatting in my “glm” since I used all education levels.