knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 10,
fig.height = 6
)library(tidyverse)
library(knitr)
library(haven)
library(janitor)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(ResourceSelection) # for Hosmer-Lemeshow
library(pROC) # for ROC/AUC
library(performance) # for model performance metrics
library(sjPlot)
library(modelsummary)
library(leaps)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))brfss_full23 <- read_xpt(
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2023.XPT"
) |>
clean_names()## [1] 433323 350
These variables were selected based on representation of behavioral, biological, and mental health factors, along with sociodemographic characteristics. These variables are well established determines of physicals health and together capture multiple dimensions that may influence physically unhealthy days.
brfss_var <- brfss_full23 |>
mutate(
# Outcome
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth %in% c(77, 99) ~ NA_real_,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# Continuous
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77, 99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
bmi = case_when(
`bmi5` == 9999 ~ NA_real_,
TRUE ~ `bmi5` / 100
),
# Binary
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
exercise = case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_
) |> factor(levels = c("No", "Yes")),
# Categorical
age_group = case_when(
`ageg5yr` %in% 1:3 ~ "18-34",
`ageg5yr` %in% 4:6 ~ "35-49",
`ageg5yr` %in% 7:9 ~ "50-64",
`ageg5yr` %in% 10:13 ~ "65+",
TRUE ~ NA_character_
) |> factor(levels = c("18-34", "35-49", "50-64", "65+")),
income = case_when(
`incomg1` %in% 1:2 ~ "Less than $25K",
`incomg1` %in% 3:4 ~ "$25K-$49K",
`incomg1` %in% 5:6 ~ "$50K-$99K",
`incomg1` == 7 ~ "$100K+",
TRUE ~ NA_character_
) |> factor(levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
education = case_when(
educa %in% 1:3 ~ "Less than HS",
educa == 4 ~ "HS/GED",
educa == 5 ~ "Some college",
educa == 6 ~ "College graduate",
TRUE ~ NA_character_
) |> factor(levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
smoking = case_when(
`smoker3` %in% 1:2 ~ "Current",
`smoker3` == 3 ~ "Former",
`smoker3` == 4 ~ "Never",
TRUE ~ NA_character_
) |> factor(levels = c("Never", "Former", "Current"))
)brfss_var |>
select(physhlth_days,menthlth_days,bmi,age_group,income,sex,
education,smoking,exercise
) |>
tbl_summary(
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body mass index",
age_group ~ "Age group",
income ~ "Household income",
sex ~ "Sex",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Highest level of education completed",
smoking ~ "Smoking status (4 levels)"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
bold_labels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023**")| Characteristic | N | N = 433,3231 |
|---|---|---|
| Physically unhealthy days (past 30) | 422,538 | 4.5 (8.8) |
| Mentally unhealthy days (past 30) | 425,215 | 4.4 (8.3) |
| Body mass index | 392,788 | 28.5 (6.5) |
| Age group | 425,544 | |
| 18-34 | 72,330 (17%) | |
| 35-49 | 82,686 (19%) | |
| 50-64 | 107,484 (25%) | |
| 65+ | 163,044 (38%) | |
| Household income | 346,700 | |
| Less than $25K | 50,256 (14%) | |
| $25K-$49K | 86,010 (25%) | |
| $50K-$99K | 183,664 (53%) | |
| $100K+ | 26,770 (7.7%) | |
| Sex | 433,323 | |
| Male | 203,782 (47%) | |
| Female | 229,541 (53%) | |
| Highest level of education completed | 430,998 | |
| Less than HS | 25,172 (5.8%) | |
| HS/GED | 106,613 (25%) | |
| Some college | 114,346 (27%) | |
| College graduate | 184,867 (43%) | |
| Smoking status (4 levels) | 410,261 | |
| Never | 251,981 (61%) | |
| Former | 113,134 (28%) | |
| Current | 45,146 (11%) | |
| Any physical activity (past 30 days) | 432,072 | 325,227 (75%) |
| 1 Mean (SD); n (%) | ||
#Report the number and percentage of missing observations
total_n <- nrow(brfss_var)
# Missing summary
missing_summary <- brfss_var |>
summarise(
menthlth_miss_n = sum(is.na(physhlth_days)),
menthlth_miss_pct = mean(is.na(physhlth_days)) * 100,
bmi_miss_n = sum(is.na(menthlth_days)),
bmi_miss_pct = mean(is.na(menthlth_days)) * 100,
smoking_miss_n = sum(is.na(bmi)),
smoking_miss_pct = mean(is.na(bmi)) * 100
)
missing_summary## # A tibble: 1 × 6
## menthlth_miss_n menthlth_miss_pct bmi_miss_n bmi_miss_pct smoking_miss_n
## <int> <dbl> <int> <dbl> <int>
## 1 10785 2.49 8108 1.87 40535
## # ℹ 1 more variable: smoking_miss_pct <dbl>
set.seed(1220)
brfss_analytic <- brfss_var |>
select(physhlth_days,menthlth_days,bmi,age_group,income,sex,
education,smoking,exercise) |>
drop_na() |>
slice_sample(n = 8000)
nrow(brfss_analytic)## [1] 8000
brfss_analytic |>
select(physhlth_days,menthlth_days,bmi,age_group,income,sex,
education,smoking,exercise) |>
tbl_summary(
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body mass index",
age_group ~ "Age group",
sex ~ "Sex",
income ~ "Household income",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Highest level of education completed",
smoking ~ "Smoking status (4 levels)"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
# add_overall() |>
bold_labels() |>
#italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics by Sex — BRFSS 2023 Analytic Sample (n = 8,000)**")| Characteristic | N | N = 8,0001 |
|---|---|---|
| Physically unhealthy days (past 30) | 8,000 | 4.4 (8.7) |
| Mentally unhealthy days (past 30) | 8,000 | 4.3 (8.1) |
| Body mass index | 8,000 | 28.7 (6.5) |
| Age group | 8,000 | |
| 18-34 | 1,277 (16%) | |
| 35-49 | 1,665 (21%) | |
| 50-64 | 2,055 (26%) | |
| 65+ | 3,003 (38%) | |
| Household income | 8,000 | |
| Less than $25K | 1,048 (13%) | |
| $25K-$49K | 1,938 (24%) | |
| $50K-$99K | 4,376 (55%) | |
| $100K+ | 638 (8.0%) | |
| Sex | 8,000 | |
| Male | 3,936 (49%) | |
| Female | 4,064 (51%) | |
| Highest level of education completed | 8,000 | |
| Less than HS | 376 (4.7%) | |
| HS/GED | 1,827 (23%) | |
| Some college | 2,138 (27%) | |
| College graduate | 3,659 (46%) | |
| Smoking status (4 levels) | 8,000 | |
| Never | 4,812 (60%) | |
| Former | 2,262 (28%) | |
| Current | 926 (12%) | |
| Any physical activity (past 30 days) | 8,000 | 6,240 (78%) |
| 1 Mean (SD); n (%) | ||
#Your goal is to identify the best linear regression model for predicting physhlth_days (continuous) from your chosen set of at least 8 predictors
mod_max <- lm(physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
education + smoking + exercise,
data = brfss_analytic)
tidy(mod_max, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.7849 | 0.6589 | 8.7800 | 0.0000 | 4.4934 | 7.0765 |
| menthlth_days | 0.3040 | 0.0114 | 26.5864 | 0.0000 | 0.2816 | 0.3264 |
| bmi | 0.0481 | 0.0140 | 3.4364 | 0.0006 | 0.0206 | 0.0755 |
| age_group35-49 | 1.3446 | 0.2986 | 4.5023 | 0.0000 | 0.7592 | 1.9300 |
| age_group50-64 | 2.6421 | 0.2876 | 9.1876 | 0.0000 | 2.0784 | 3.2058 |
| age_group65+ | 2.8790 | 0.2749 | 10.4742 | 0.0000 | 2.3402 | 3.4178 |
| income$25K-$49K | -2.6229 | 0.3074 | -8.5332 | 0.0000 | -3.2254 | -2.0203 |
| income$50K-$99K | -3.2345 | 0.2961 | -10.9223 | 0.0000 | -3.8150 | -2.6540 |
| income$100K+ | -3.8191 | 0.4328 | -8.8252 | 0.0000 | -4.6674 | -2.9708 |
| sexFemale | 0.0309 | 0.1794 | 0.1720 | 0.8635 | -0.3209 | 0.3826 |
| educationHS/GED | -1.1996 | 0.4510 | -2.6598 | 0.0078 | -2.0836 | -0.3155 |
| educationSome college | -0.7213 | 0.4530 | -1.5924 | 0.1113 | -1.6093 | 0.1666 |
| educationCollege graduate | -1.2362 | 0.4538 | -2.7240 | 0.0065 | -2.1259 | -0.3466 |
| smokingFormer | 0.6796 | 0.2059 | 3.3008 | 0.0010 | 0.2760 | 1.0832 |
| smokingCurrent | 1.1538 | 0.2971 | 3.8831 | 0.0001 | 0.5713 | 1.7363 |
| exerciseYes | -3.4495 | 0.2254 | -15.3021 | 0.0000 | -3.8913 | -3.0076 |
glance(mod_max) |>
dplyr::select(r.squared, adj.r.squared, sigma, AIC, BIC, df.residual) |>
mutate(across(everything(), \(x) round(x, 3))) |>
kable(caption = "Maximum Model: Fit Statistics") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| r.squared | adj.r.squared | sigma | AIC | BIC | df.residual |
|---|---|---|---|---|---|
| 0.184 | 0.183 | 7.859 | 55707.42 | 55826.2 | 7984 |
Report and interpret: R-squared, Adjusted R-squared, AIC (using AIC()), and BIC (using BIC() R-squared = 0.184 The model explains about 18.4% of the variation in physically unhealthy days.
Adjusted R-squared = 0.183 After adjusting for the number of predictors, the model explains about 18.3% of the variation. There is almost identical values for r-squared and adjusted r-squared showing that adding predictors did not meaningfully inflate model fit.
AIC = 55,707.42 AIC is a measure of model quality that balances goodness of fit with model complexity. This value is not meaningful on its own, but it is used to compare models—lower AIC indicates a better model when comparing alternative specifications.
BIC = 55826.2 BIC is similar to AIC but penalizes model complexity more strongly. Like AIC, it is only useful for comparison across models. A lower BIC would indicate a more a simpler model with adequate fit. In this case, BIC is greater than AIC.
#Step 2:Best Subsets Regression
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
education + smoking + exercise,
data = brfss_analytic,
nvmax = 17, # maximum number of variables to consider
method = "exhaustive"
)
best_summary <- summary(best_subsets)
subset_metrics <- tibble(
p = 1:length(best_summary$adjr2),
`Adj. R²` = best_summary$adjr2,
BIC = best_summary$bic,
Cp = best_summary$cp
)
p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
theme_minimal(base_size = 12)
p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_summary$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
gridExtra::grid.arrange(p1, p2, ncol = 2)## Best model by Adj. R²: 14 variables
## Best model by BIC: 11 variables
# Show which variables are in the BIC-best model
best_bic_idx <- which.min(best_summary$bic)
best_vars <- names(which(best_summary$which[best_bic_idx, -1]))
cat("\nVariables in BIC-best model:\n")##
## Variables in BIC-best model:
## menthlth_days
## bmi
## age_group35-49
## age_group50-64
## age_group65+
## income$25K-$49K
## income$50K-$99K
## income$100K+
## smokingFormer
## smokingCurrent
## exerciseYes
In 2-3 sentences, describe what you observe: Do the two criteria agree on the best model size? If not, which criterion favors a simpler model?
The two criteria do not agree on the best model size. The adjusted r-squared model has a model size of about 12, while the BIC graph has a model size of about 9. The adjusted r-squared model continues to increase slightly and then levels off with more variables, while the BIC reaches its minimum and then worsens, showing that a smaller model would have been preferred. The BIC favors a simpler model.
#Step 3: Stepwise Selection Methods
#Backward elimination
mod_max <- lm(physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
education + smoking + exercise,
data = brfss_analytic)
cat("=== BACKWARD ELIMINATION ===\n\n")## === BACKWARD ELIMINATION ===
## Step 1: Maximum model
## Variables: menthlth_days, bmi, age_group35-49, age_group50-64, age_group65+, income$25K-$49K, income$50K-$99K, income$100K+, sexFemale, educationHS/GED, educationSome college, educationCollege graduate, smokingFormer, smokingCurrent, exerciseYes
# Show p-values for the maximum model
pvals <- tidy(mod_back) |>
filter(term != "(Intercept)") |>
arrange(desc(p.value)) |>
dplyr::select(term, estimate, p.value) |>
mutate(across(where(is.numeric), \(x) round(x, 4)))
pvals |>
head(5) |>
kable(caption = "Maximum Model: Variables Sorted by p-value (Highest First)") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| term | estimate | p.value |
|---|---|---|
| sexFemale | 0.0309 | 0.8635 |
| educationSome college | -0.7213 | 0.1113 |
| educationHS/GED | -1.1996 | 0.0078 |
| educationCollege graduate | -1.2362 | 0.0065 |
| smokingFormer | 0.6796 | 0.0010 |
# Automated backward elimination using AIC
mod_backward <- step(mod_max, direction = "backward", trace = 1)## Start: AIC=33002.4
## physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
## education + smoking + exercise
##
## Df Sum of Sq RSS AIC
## - sex 1 2 493117 33000
## <none> 493115 33002
## - education 3 747 493862 33009
## - bmi 1 729 493844 33012
## - smoking 2 1293 494408 33019
## - income 3 7823 500938 33122
## - age_group 3 8240 501355 33129
## - exercise 1 14462 507577 33232
## - menthlth_days 1 43656 536771 33679
##
## Step: AIC=33000.43
## physhlth_days ~ menthlth_days + bmi + age_group + income + education +
## smoking + exercise
##
## Df Sum of Sq RSS AIC
## <none> 493117 33000
## - education 3 745 493862 33007
## - bmi 1 728 493845 33010
## - smoking 2 1295 494412 33017
## - income 3 7909 501026 33122
## - age_group 3 8302 501418 33128
## - exercise 1 14480 507597 33230
## - menthlth_days 1 44141 537257 33684
tidy(mod_backward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome college | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
| educationCollege graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
#Forward selection
brfss_analytic <- na.omit(brfss_analytic)
# Null model
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
# Full model (DO NOT redefine if already created earlier — but kept here for safety)
mod_max <- lm(
physhlth_days ~ menthlth_days + bmi + age_group + income + sex +
education + smoking + exercise,
data = brfss_analytic
)
cat("=== FORWARD SELECTION ===\n\n")## === FORWARD SELECTION ===
## Step 1: Null model
## Variables: Intercept only
# Automated forward selection using AIC
mod_forward <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward",
trace = 1
)## Start: AIC=34603.45
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 59071 545563 33783
## + exercise 1 37706 566927 34090
## + income 3 32443 572191 34168
## + smoking 2 12018 592616 34447
## + education 3 11873 592761 34451
## + age_group 3 5515 599119 34536
## + bmi 1 4732 599901 34543
## + sex 1 1559 603075 34585
## <none> 604634 34603
##
## Step: AIC=33783.01
## physhlth_days ~ menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 28346.8 517216 33358
## + income 3 19686.5 525876 33495
## + age_group 3 14455.2 531108 33574
## + education 3 8240.9 537322 33667
## + smoking 2 6070.4 539492 33697
## + bmi 1 2821.0 542742 33744
## + sex 1 266.4 545296 33781
## <none> 545563 33783
##
## Step: AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + income 3 11764.1 505452 33180
## + age_group 3 9921.3 507295 33209
## + smoking 2 3952.9 513263 33301
## + education 3 3409.0 513807 33311
## + bmi 1 753.5 516463 33348
## <none> 517216 33358
## + sex 1 106.7 517109 33359
##
## Step: AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
##
## Df Sum of Sq RSS AIC
## + age_group 3 9400.2 496052 33036
## + smoking 2 2483.5 502968 33145
## + bmi 1 781.6 504670 33170
## + education 3 937.1 504515 33171
## <none> 505452 33180
## + sex 1 6.7 505445 33182
##
## Step: AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
##
## Df Sum of Sq RSS AIC
## + smoking 2 1427.13 494625 33017
## + education 3 1013.78 495038 33026
## + bmi 1 666.76 495385 33027
## <none> 496052 33036
## + sex 1 14.93 496037 33038
##
## Step: AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking
##
## Df Sum of Sq RSS AIC
## + bmi 1 762.78 493862 33007
## + education 3 779.94 493845 33010
## <none> 494625 33017
## + sex 1 0.00 494625 33019
##
## Step: AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi
##
## Df Sum of Sq RSS AIC
## + education 3 745.36 493117 33000
## <none> 493862 33007
## + sex 1 0.35 493862 33009
##
## Step: AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi + education
##
## Df Sum of Sq RSS AIC
## <none> 493117 33000
## + sex 1 1.8268 493115 33002
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome college | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
| educationCollege graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
#Stepwise selection
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both", trace = 1)## Start: AIC=34603.45
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 59071 545563 33783
## + exercise 1 37706 566927 34090
## + income 3 32443 572191 34168
## + smoking 2 12018 592616 34447
## + education 3 11873 592761 34451
## + age_group 3 5515 599119 34536
## + bmi 1 4732 599901 34543
## + sex 1 1559 603075 34585
## <none> 604634 34603
##
## Step: AIC=33783.01
## physhlth_days ~ menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 28347 517216 33358
## + income 3 19686 525876 33495
## + age_group 3 14455 531108 33574
## + education 3 8241 537322 33667
## + smoking 2 6070 539492 33697
## + bmi 1 2821 542742 33744
## + sex 1 266 545296 33781
## <none> 545563 33783
## - menthlth_days 1 59071 604634 34603
##
## Step: AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + income 3 11764 505452 33180
## + age_group 3 9921 507295 33209
## + smoking 2 3953 513263 33301
## + education 3 3409 513807 33311
## + bmi 1 754 516463 33348
## <none> 517216 33358
## + sex 1 107 517109 33359
## - exercise 1 28347 545563 33783
## - menthlth_days 1 49711 566927 34090
##
## Step: AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
##
## Df Sum of Sq RSS AIC
## + age_group 3 9400 496052 33036
## + smoking 2 2484 502968 33145
## + bmi 1 782 504670 33170
## + education 3 937 504515 33171
## <none> 505452 33180
## + sex 1 7 505445 33182
## - income 3 11764 517216 33358
## - exercise 1 20424 525876 33495
## - menthlth_days 1 41532 546984 33810
##
## Step: AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
##
## Df Sum of Sq RSS AIC
## + smoking 2 1427 494625 33017
## + education 3 1014 495038 33026
## + bmi 1 667 495385 33027
## <none> 496052 33036
## + sex 1 15 496037 33038
## - age_group 3 9400 505452 33180
## - income 3 11243 507295 33209
## - exercise 1 17199 513251 33307
## - menthlth_days 1 47177 543229 33761
##
## Step: AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking
##
## Df Sum of Sq RSS AIC
## + bmi 1 763 493862 33007
## + education 3 780 493845 33010
## <none> 494625 33017
## + sex 1 0 494625 33019
## - smoking 2 1427 496052 33036
## - age_group 3 8344 502968 33145
## - income 3 9897 504522 33169
## - exercise 1 16705 511329 33281
## - menthlth_days 1 44789 539414 33708
##
## Step: AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi
##
## Df Sum of Sq RSS AIC
## + education 3 745 493117 33000
## <none> 493862 33007
## + sex 1 0 493862 33009
## - bmi 1 763 494625 33017
## - smoking 2 1523 495385 33027
## - age_group 3 8245 502107 33133
## - income 3 9781 503643 33157
## - exercise 1 15215 509077 33247
## - menthlth_days 1 44206 538068 33690
##
## Step: AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi + education
##
## Df Sum of Sq RSS AIC
## <none> 493117 33000
## + sex 1 2 493115 33002
## - education 3 745 493862 33007
## - bmi 1 728 493845 33010
## - smoking 2 1295 494412 33017
## - income 3 7909 501026 33122
## - age_group 3 8302 501418 33128
## - exercise 1 14480 507597 33230
## - menthlth_days 1 44141 537257 33684
tidy(mod_stepwise, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome college | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
| educationCollege graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
In 3-5 sentences, compare the results: • Do the three methods agree on the same final model? • Which variables (if any) were excluded by all three methods? • Which variables were retained by all three methods?
The 3 models all produced the same final mode, indicating strong agreement in variable selection. No variables were excluded by all three models since each method has identical outcomes. The variables retained by all three methods include: menthlth_days, bmi, age_group, income, education, smoking, and exerciseYes. This shows that all predictors are consistently important under all selection models. #Step 4: Final Model Selection and Interpretation
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Final Model: Coefficients with Confidence Intervals",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome college | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
| educationCollege graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
Choose your final model. State which method(s) guided your choice and why. Display the coefficient table for your final model using tidy() with confidence interval. Interpret the coefficients for at least three predictors in plain language, including units and “holding all other variables constant” language. Include at least one continuous and one categorical predictor.
The final model was selected using stepwise selection. This model was chosen because it evaluates predictors in both directions and provides a balance between model fits. Since all three model selections all produced the same final set of predictors, the results were consistent and suggest a stable model.
Holding all other variables constant, each one-unit increase in BMI is associated with an increase of approximately 0.048 physically unhealthy days in the past 30 days. This shows that higher BMI is associated with worse physical health outcomes.
Holding all other variables constant, each additional mentally unhealthy day is associated with an increase of approximately 0.304 physically unhealthy days. This shows a strong relationship between mental and physical health, where worse mental health corresponds to worse physical health.
Holding all other variables constant, individuals who reported any physical activity in the past 30 days had on average 3.45 fewer physically unhealthy days compared to those who did not exercise. This shows that physical activity is strongly associated with better physical health.
Holding all other variables constant, individuals earning $100K+ annually reported about 3.83 fewer physically unhealthy days compared to the lowest income group. This shows that the higher income groups are associated with fewer physically unhealthy days compared to lower income groups.
#Step 5: LINE Assumption Check
Residuals vs. Fitted: Is there evidence of non-linearity or non-constant variance?
There is no evidence of non-linearity and non-constant variance.
Normal Q-Q: Do the residuals approximately follow a normal distribution? Where do they deviate?
The residuals do not perfectly follow a normal distribution. They align fairly well in the middle but deviate in the tails, especially the upper tail, indicating potential skewness or heavy tails.
Scale-Location: Is the spread of residuals roughly constant across fitted values?
The spread of residuals is not constant across fitted values. The upward trend suggests increasing variance as fitted values increase.
Residuals vs. Leverage: Are there any highly influential observations (large Cook’s distance)
There are no strongly influential observations. While a few points have slightly higher leverage, non appear to exeed cooks distnace thresholds.
Write a brief conclusion (2-3 sentences): Are the LINE assumptions reasonably satisfied? What violations, if any, do you observe?
The LINE assumptions are somewhat satisfied. Linearity and constant variance are somewhat violated due to the visible patterns and heteroscedasticity, and normality is imperfect due to tail deviations. However, there are no major issues with influential observations, so the model may still be usable with cation.
#Part 2: Logistic Regression #Step 1: Create the Binary Outcome
brfss_new <- brfss_analytic |>
mutate(
frequent_distress = ifelse(physhlth_days >= 14, 1, 0),
frequent_distress = factor(frequent_distress,
levels = c(0, 1),
labels = c("No", "Yes"))
)
brfss_new |>
count(frequent_distress) |>
mutate(percent = n / sum(n) * 100) |>
mutate(percent = round(percent, 1)) |>
kable(
caption = "Prevalence of Frequent Physical Distress",
col.names = c("Frequent Distress", "Count", "Percent")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Frequent Distress | Count | Percent |
|---|---|---|
| No | 6914 | 86.4 |
| Yes | 1086 | 13.6 |
In 1-2 sentences, comment on the balance of the outcome.
Approximately 13.6% of respondents report frequent physical distress, while 86% do not report frequent physical distress. This shows that the outcome is moderately imbalanced, with a much larger proportion of individuals in the “no” category, which is typical for health burden indicators.
#Step 2: Simple (Unadjusted) Logistic Regression
Choose one predictor from your final linear model that you expect to have a strong association with physical distress. State why you chose it.
I chose mentally unhealthy days because it is a key component of overall health status and is strongly linked to physical health outcomes through biological and behavioral pathways.
mod_ment <- glm(frequent_distress ~ menthlth_days, data = brfss_new,
family = binomial(link = "logit"))
tidy(mod_ment, conf.int = TRUE, exponentiate = FALSE) |>
kable(digits = 3, caption = "Simple Logistic Regression: FPD ~ mentally unhealthy days (Log-Odds Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -2.281 | 0.042 | -54.333 | 0 | -2.364 | -2.199 |
| menthlth_days | 0.070 | 0.003 | 22.119 | 0 | 0.064 | 0.077 |
mod_ment |>
tbl_regression(
exponentiate = TRUE,
label = list(menthlth_days ~ "Mentally unhealthy days in past 30 days")
) |>
bold_labels() |>
bold_p()| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| Mentally unhealthy days in past 30 days | 1.07 | 1.07, 1.08 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
Interpret the odds ratio in plain language. State whether the association is statistically significant at alpha = 0.05 and how you know (Wald test p-value or CI excluding 1.0)
For every one unit increase in mentally unhealthy days, the odds of frequent physical distress are 1.08, with a 95% CI of [1.07,1.08], holding all else constant. This means that individuals with more mentally unhealthy days have higher odds of experiencing frequent physical distress. The association is statistically significant with a p value less than 0.05 (<0.001) and the CI does not include 1.
#Step 3: Multiple Logistic Regression
mod_logistic <- glm(
frequent_distress ~ menthlth_days + bmi + age_group + income + sex +
education + smoking + exercise,
data = brfss_new,
family = binomial)
summary(mod_logistic)##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + age_group +
## income + sex + education + smoking + exercise, family = binomial,
## data = brfss_new)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.255697 0.243962 -9.246 < 2e-16 ***
## menthlth_days 0.068198 0.003664 18.611 < 2e-16 ***
## bmi 0.014348 0.005155 2.783 0.005380 **
## age_group35-49 0.762689 0.147703 5.164 2.42e-07 ***
## age_group50-64 1.213299 0.139855 8.675 < 2e-16 ***
## age_group65+ 1.291576 0.137163 9.416 < 2e-16 ***
## income$25K-$49K -0.589866 0.101555 -5.808 6.31e-09 ***
## income$50K-$99K -0.853635 0.100733 -8.474 < 2e-16 ***
## income$100K+ -1.498863 0.230273 -6.509 7.56e-11 ***
## sexFemale 0.014039 0.073171 0.192 0.847851
## educationHS/GED -0.336043 0.149017 -2.255 0.024129 *
## educationSome college -0.148256 0.149889 -0.989 0.322613
## educationCollege graduate -0.494821 0.155068 -3.191 0.001418 **
## smokingFormer 0.296136 0.081651 3.627 0.000287 ***
## smokingCurrent 0.358801 0.105610 3.397 0.000680 ***
## exerciseYes -0.885038 0.076998 -11.494 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6354.8 on 7999 degrees of freedom
## Residual deviance: 5280.5 on 7984 degrees of freedom
## AIC: 5312.5
##
## Number of Fisher Scoring iterations: 6
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 3))) |>
kable(
caption = "Adjusted Odds Ratios for Frequent Physical Distress",
digits = 3
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.105 | 0.244 | -9.246 | 0.000 | 0.065 | 0.168 |
| menthlth_days | 1.071 | 0.004 | 18.611 | 0.000 | 1.063 | 1.078 |
| bmi | 1.014 | 0.005 | 2.783 | 0.005 | 1.004 | 1.025 |
| age_group35-49 | 2.144 | 0.148 | 5.164 | 0.000 | 1.610 | 2.875 |
| age_group50-64 | 3.365 | 0.140 | 8.675 | 0.000 | 2.569 | 4.448 |
| age_group65+ | 3.639 | 0.137 | 9.416 | 0.000 | 2.795 | 4.787 |
| income$25K-$49K | 0.554 | 0.102 | -5.808 | 0.000 | 0.454 | 0.677 |
| income$50K-$99K | 0.426 | 0.101 | -8.474 | 0.000 | 0.350 | 0.519 |
| income$100K+ | 0.223 | 0.230 | -6.509 | 0.000 | 0.139 | 0.345 |
| sexFemale | 1.014 | 0.073 | 0.192 | 0.848 | 0.879 | 1.171 |
| educationHS/GED | 0.715 | 0.149 | -2.255 | 0.024 | 0.535 | 0.960 |
| educationSome college | 0.862 | 0.150 | -0.989 | 0.323 | 0.645 | 1.160 |
| educationCollege graduate | 0.610 | 0.155 | -3.191 | 0.001 | 0.451 | 0.829 |
| smokingFormer | 1.345 | 0.082 | 3.627 | 0.000 | 1.145 | 1.577 |
| smokingCurrent | 1.432 | 0.106 | 3.397 | 0.001 | 1.162 | 1.758 |
| exerciseYes | 0.413 | 0.077 | -11.494 | 0.000 | 0.355 | 0.480 |
Interpret the adjusted odds ratios for at least three predictors in plain language, using “holding all other variables constant” language. Include at least one continuous and one categorical predictor.
Holding all other variables constant, each additional of mentally unhealthy days is associated with 1.071 times the odds of frequent psychical distress(95% CI: 1.063 to 1.078). This shows that the worse mental health is, the higher odds of frequent physical distress after adjusting for other predictors.
Holding all other variables constant, each one-unit increase in BMI is associated with 1.014 times the odds of frequent physical distress (95% CI: 1.004 to 1.025). This shows that the higher the BMI, the higher the odds of frequent physical distress.
Holding all other variables constant, individuals who reported any physical activity in the past 30 days have 0.413 times the odds of frequent physical distress compared to those who did not exercise (95% CI: 0.355 to 0.480).This shows that physical activity is strongly associated with lower odds of frequent physical distress. #Step 4: Likelihood Ratio Test
mod_reduced <- glm(
frequent_distress ~ menthlth_days + bmi + age_group + income + sex +
smoking + exercise,
data = brfss_new,
family = binomial
)
anova(mod_reduced, mod_logistic, test = "LRT") |>
kable(digits = 3,
caption = "LR Test: Does removing education improve the model?") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7987 | 5299.505 | NA | NA | NA |
| 7984 | 5280.523 | 3 | 18.982 | 0 |
• State the null and alternative hypotheses • Report the test statistic (deviance), degrees of freedom, and p-value • Conclude at alpha = 0.05: Does this group of predictors significantly improve the model?
Null hypothesis: Education level is not associated with frequent physical distress(all education coefficients=0).
Alternative hypothesis: Education level is associated with frequent physical distress(at least one education coefficients≠0).
Chi-square (Deviance) = 18.982 Degrees of freedom = 3 p-value = <0.001
At a p-value of less than 0.05, we can reject the null hypothesis because the p-value is <0.001. This group of predictors significantly improves the model.
#Step 5: ROC Curve and AUC
roc_obj <- roc(
response = brfss_new$frequent_distress,
predictor = fitted(mod_logistic),
levels = c("No", "Yes"),
direction = "<"
)
auc_value <- auc(roc_obj)
ggroc(roc_obj, color = "steelblue", linewidth = 1.2) +
geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "red") +
labs(title = "ROC Curve for Frequent Mental Distress Model",
subtitle = paste0("AUC = ", round(auc_value, 3)),
x = "Specificity", y = "Sensitivity") +
theme_minimal()Interpret the AUC: What does this value tell you about the model’s ability to discriminate between individuals with and without frequent physical distress? Use the following guide: • 0.5 = no discrimination (coin flip) • 0.5-0.7 = poor discrimination • 0.7-0.8 = acceptable discrimination • 0.8-0.9 = excellent discrimination • 0.9+ = outstanding discrimination
The area under the ROC curve for is 0.777.This shows an acceptable discrimination, meaning the model has a good ability to distinguish between individuals with and without frequent physical distress. #Step 6: Hosmer-Lemeshow Goodness-of-Fit Test
hl_test <- hoslem.test(
x = as.numeric(brfss_new$frequent_distress) - 1,
y = fitted(mod_logistic),
g = 10
)
hl_test##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: as.numeric(brfss_new$frequent_distress) - 1, fitted(mod_logistic)
## X-squared = 11.097, df = 8, p-value = 0.1963
State the null hypothesis (the model fits the data well) and alternative hypothesis (the model does not fit the data well) • Report the test statistic, degrees of freedom, and p-value • Interpret: At alpha = 0.05, is there evidence of poor model fit? • In 1-2 sentences, discuss how the Hosmer-Lemeshow result complements the ROC/AUC finding
Null hypothesis (H₀): The logistic regression model fits the data well (no evidence of poor fit).
Alternative hypothesis (H₁): The logistic regression model does not fit the data well (evidence of poor fit).
Chi-square (X²) = 11.097 Degrees of freedom = 8 p-value = 0.1963
We fail to reject the null hypothesis because the p-value (0.1963) is greater than 0.05, This shows no evidence of poor model fit, suggesting the model fits the data adequately. The Hosmer-Lemeshow test evaluates calibration, while ROC/AUC measures discrimination. Together they suggest the model both fits the data reasonably well and have acceptable performance.
#Part 3: Reflection Write 250-400 words in continuous prose (no bullet points). A. Statistical Insights (5 points): What do your results suggest about the factors associated with physical health burden among U.S. adults? Which predictors were the strongest in both the linear and logistic models? Were any predictors significant in one model but not the other? What are the key limitations of using cross-sectional survey data for this type of analysis? Name at least two specific potential confounders not measured in your model.
The results suggest that both behavioral and socioeconomic predictors play an important role in physical health. Across both the linear and logistic models, mentally unhealthy days emerged as one of the strongest predictors showing a large and statistically significant association with worse health outcomes. In addition, exercise demonstrated a strong effect as well with individuals who reported physical activity having fewer unhealthy days and lower odds of frequent physical distress than those who didn’t participant in physical activity. Income was associated with better health for those who had higher income. For age, individuals who were older had worse outcomes. Most predictors were consistent across both models, although some variables were not statistically significant in one model while appearing more relevant in other. A key to using cross-sectional data is the lack of causality. In addition, there may be confounding variables that were not measured that could be influencing the preditors and outcomes.
B. Linear versus Logistic Regression (5 points): Compare what the linear regression model tells you versus what the logistic regression model tells you about the same research question. What information does each approach provide that the other does not? In what situations would you prefer one approach over the other? How do the model selection criteria differ between the two frameworks (for example, R-squared versus AUC)?
The linear model estimates changes in the number of physically unhealthy days with a direct interpretation in terms of units, while the logistic model estimates odds of experiencing frequent physical distress which is used to see who is classified as high risk. The linear model uses r-squared to assess variance, while the logistic models uses measures like AUC to evaluate classification performance. Linear regression is preferred when the outcome is continuous, while logistic regression is used when the outcome is binary.
C. AI Transparency (5 points): Describe any parts of this assignment where you sought AI assistance. Include the specific prompts you used and how you verified the correctness of the output. If you did not use AI, state this explicitly.
I used AI to help with debugging. For example, when my code was not running for initial setup of brfss_2023, I asked co-pilot “why is my code not running” and I would insert the code that I had written. For initial set up, I had used “clean_names” function, which standardized variable names by removing special characters, so I did not need to include the “_BMI” and I needed to just have “BMI” for all the predictors like this, which at the time I did not realize. Co-pilot was able to tell me to take the underscore away and just use the name.