library(tidyverse)
library(dplyr)
library(haven) #for reading XPT files
library(broom) #for tidying model output
library(kableExtra) #for formatted tables
library(janitor)
library(car) #for diagnostic tools
library(gtsummary) #for descriptive tables
library(ggeffects)
library(leaps) #for best subsets regression
library (pROC) #for ROC curves and AUC
library (ResourceSelection) #for Hosmer-Lemeshow testbrfss_raw <- read_xpt("/Users/nataliasmall/Downloads/EPI 553/LLCP2023.XPT ") %>%
clean_names()
#Total number of rows and columns in raw dataset
cat("Total Rows:", nrow(brfss_raw), "\n")## Total Rows: 433323
## Total Columns: 350
brfss_var <- brfss_raw |>
dplyr::select(physhlth, menthlth, bmi5, sexvar, exerany2, addepev3, ageg5yr, incomg1, educa, smoker3, genhlth, marital)# ── Recode and clean ──────────────────────────────────────────────────────────
brfss_cleaned <- brfss_var %>%
mutate(
# Outcome: physically unhealthy days in past 30 (88 = none = 0; 77/99 = NA; 1-30 as is)
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# mentally unhealthy days in past 30 (88 = none = 0; 77/99 = NA; 1-30 as is)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Body mass index × 100 (divide by 100)
bmi = ifelse(bmi5 == 9999, NA, bmi5 / 100),
# Sex assigned at birth (1 = Male, 2 = Female)
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
# Exercise in past 30 days (1 = Yes, 2 = No)
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# Depression: Ever told depressive disorder
depression = factor(case_when(
addepev3 == 1 ~ "Yes",
addepev3 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# Age group
age_group = factor (case_when(
ageg5yr %in% 1:3 ~ "18-34",
ageg5yr %in% 4:6 ~ "35-49",
ageg5yr %in% 7:9 ~ "50-64",
ageg5yr %in% 10:13 ~ "65+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
), levels = c("18-34", "35-49", "50-64", "65+")),
# Annual household income
income = factor (case_when(
incomg1 %in% 1:2 ~ "Less than $25K",
incomg1 %in% 3:4 ~ "$25K-$49K",
incomg1 %in% 5:6 ~ "$50K-$99K",
incomg1 == 7 ~ "$100K+",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
), levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
# Education
education = factor(case_when(
educa %in% 1:3 ~ "Less than HS",
educa == 4 ~ "HS/GED",
educa == 5 ~ "Some college",
educa == 6 ~ "College graduate",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
), levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
# Smoking
smoking = factor(case_when(
smoker3 %in% 1:2 ~ "Current",
smoker3 == 3 ~ "Former",
smoker3 == 4 ~ "Never",
smoker3 == 9 ~ NA_character_,
TRUE ~ NA_character_
), levels = c("Never", "Current", "Former")),
# General Health Status
gen_health = factor(case_when(
genhlth %in% 1:2 ~ "Excellent/Very Good",
genhlth == 3 ~ "Good",
genhlth %in% 4:5 ~ "Fair/Poor",
genhlth %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
), levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
# Marital status
marital = factor (case_when(
marital %in% c(1, 6) ~ "Married/Partnered",
marital %in% 2:4. ~ "Previously married",
marital == 5 ~ "Never married",
marital == 9 ~ NA_character_,
TRUE ~ NA_character_
), levels = c("Married/Partnered", "Previously married", "Never married")),
)%>%
dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital)# Number and percentage of missing observations for physhlth_days and at least two other key variables, before removing any missing values
missing_report <- brfss_cleaned %>%
dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") %>%
mutate(Pct_Missing = round(100 * (N_Missing / nrow(brfss_cleaned)), 1))
missing_report %>%
kable(caption = "Number And Percentage Of Missing Observations Before Exclusions",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Variable | N_Missing | Pct_Missing |
|---|---|---|
| physhlth_days | 10785 | 2.5 |
| menthlth_days | 8108 | 1.9 |
| bmi | 40535 | 9.4 |
| sex | 0 | 0.0 |
| exercise | 1251 | 0.3 |
| depression | 2587 | 0.6 |
| age_group | 7779 | 1.8 |
| income | 86623 | 20.0 |
| education | 2325 | 0.5 |
| smoking | 23062 | 5.3 |
| gen_health | 1262 | 0.3 |
| marital | 4289 | 1.0 |
# ── Analytic sample (reproducible random sample) ────────────────────────────
set.seed(1220)
brfss_analytic <- brfss_cleaned |>
drop_na(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital) |>
slice_sample(n = 8000)
# Save
saveRDS(brfss_analytic,
"/Users/nataliasmall/Downloads/EPI 553/brfss_hw4_analytic.rds")
# Report final analytic n
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_analytic), ncol(brfss_analytic))) %>%
kable(caption = "Analytic Dataset Dimensions") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 12 |
brfss_analytic |>
tbl_summary(
include = c(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital),
type = list(
c(physhlth_days, menthlth_days, bmi) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
physhlth_days ~ "Physically unhealthy days (past 30)",
menthlth_days ~ "Mentally unhealthy days (past 30)",
bmi ~ "Body mass index",
sex ~ "Sex",
exercise ~ "Any physical activity (past 30 days)",
depression ~ "Ever told depressive disorder",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Highest level of education completed",
smoking ~ "Smoking status",
gen_health ~ "General health status",
marital ~ "Marital status"
)
) |>
add_n() |>
add_overall() |>
bold_labels()| Characteristic | N | N = 8,0001 |
|---|---|---|
| Physically unhealthy days (past 30) | 8,000 | 4 (9) |
| Mentally unhealthy days (past 30) | 8,000 | 4 (8) |
| Body mass index | 8,000 | 29 (7) |
| Sex | 8,000 | |
| Male | 3,971 (50%) | |
| Female | 4,029 (50%) | |
| Any physical activity (past 30 days) | 8,000 | 6,157 (77%) |
| Ever told depressive disorder | 8,000 | 1,776 (22%) |
| Age group | 8,000 | |
| 18-34 | 1,294 (16%) | |
| 35-49 | 1,657 (21%) | |
| 50-64 | 2,109 (26%) | |
| 65+ | 2,940 (37%) | |
| Annual household income | 8,000 | |
| Less than $25K | 1,090 (14%) | |
| $25K-$49K | 1,931 (24%) | |
| $50K-$99K | 4,297 (54%) | |
| $100K+ | 682 (8.5%) | |
| Highest level of education completed | 8,000 | |
| Less than HS | 391 (4.9%) | |
| HS/GED | 1,887 (24%) | |
| Some college | 2,115 (26%) | |
| College graduate | 3,607 (45%) | |
| Smoking status | 8,000 | |
| Never | 4,808 (60%) | |
| Current | 962 (12%) | |
| Former | 2,230 (28%) | |
| General health status | 8,000 | |
| Excellent/Very Good | 3,926 (49%) | |
| Good | 2,648 (33%) | |
| Fair/Poor | 1,426 (18%) | |
| Marital status | 8,000 | |
| Married/Partnered | 4,582 (57%) | |
| Previously married | 2,050 (26%) | |
| Never married | 1,368 (17%) | |
| 1 Mean (SD); n (%) | ||
# Fit a multiple linear regression model for at least 8 predictors
model_mlr <- lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + age_group +
income + education + smoking + gen_health + marital, data = brfss_analytic)
# Display the full summary() output
summary(model_mlr)##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## depression + age_group + income + education + smoking + gen_health +
## marital, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.795 -2.839 -1.145 0.690 31.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.20132 0.62746 3.508 0.000453 ***
## menthlth_days 0.18362 0.01135 16.172 < 2e-16 ***
## bmi -0.01877 0.01287 -1.459 0.144638
## sexFemale 0.23449 0.16389 1.431 0.152538
## exerciseYes -1.93289 0.20408 -9.471 < 2e-16 ***
## depressionYes 0.77536 0.21528 3.602 0.000318 ***
## age_group35-49 0.76935 0.28365 2.712 0.006696 **
## age_group50-64 1.40108 0.27874 5.026 5.11e-07 ***
## age_group65+ 1.72725 0.27852 6.202 5.87e-10 ***
## income$25K-$49K -1.08440 0.27627 -3.925 8.74e-05 ***
## income$50K-$99K -1.52385 0.27607 -5.520 3.50e-08 ***
## income$100K+ -1.31580 0.39821 -3.304 0.000956 ***
## educationHS/GED 0.29371 0.40089 0.733 0.463806
## educationSome college 1.00198 0.40273 2.488 0.012867 *
## educationCollege graduate 1.05262 0.40445 2.603 0.009269 **
## smokingCurrent 0.46414 0.26617 1.744 0.081241 .
## smokingFormer 0.43777 0.18776 2.332 0.019749 *
## gen_healthGood 1.40982 0.18635 7.565 4.30e-14 ***
## gen_healthFair/Poor 10.06509 0.25239 39.879 < 2e-16 ***
## maritalPreviously married -0.26701 0.20679 -1.291 0.196680
## maritalNever married -0.41244 0.24899 -1.656 0.097666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared: 0.3258, Adjusted R-squared: 0.3242
## F-statistic: 192.8 on 20 and 7979 DF, p-value: < 2.2e-16
# Report and interpret: R-squared, Adjusted R-squared, AIC (using AIC()), and BIC (using BIC())
AIC(model_mlr)## [1] 54115.05
## [1] 54268.77
R-squared: 0.3258 Adjusted R-squared: 0.3242 AIC: 54115.05 BIC: 54268.77 Interpretation: The model has an R-squared of 0.3258, indicating that the included predictors explain roughly 32.6% of the variance in physical health days. The small difference between the R-squared and Adjusted R-squared, 0.3242, suggests the model is not over-fitted. AIC and BIC provide a baseline for model fit, in future comparisons, a model with lower values for these criteria would be considered a better fit for the data.
# Use leaps::regsubsets() to perform best subsets regression on maximum model
# Set nvmax equal to the total number of parameters in your maximum model
best_subsets <- regsubsets(physhlth_days ~ menthlth_days + bmi + sex + exercise + depression + age_group +
income + education + smoking + gen_health + marital, data = brfss_analytic,
nvmax = 18,
method = "exhaustive")
best_summary <- summary(best_subsets)
# Plot showing Adjusted R-squared and BIC across model sizes (number of predictors)
subset_metrics <- tibble(
p = 1:length(best_summary$adjr2),
`Adj. R²` = best_summary$adjr2,
BIC = best_summary$bic,
Cp = best_summary$cp
)
p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
theme_minimal(base_size = 12)
p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_summary$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
gridExtra::grid.arrange(p1, p2, ncol = 2)## Best model by Adj. R²: 18 variables
## Best model by BIC: 9 variables
Interpretation: The two criteria do not agree on the best model size. The Adjusted R-squared is maximized at 18 variables, while the BIC is minimized at 9 variables. This sugests BIC applies a stronger penalty for model complexity, perfering a simpler model compared to Adjusted R-squared.
# Backward elimination: Start from the maximum model (direction = "backward")
# Automated backward elimination using AIC
mod_backward <- step(model_mlr, direction = "backward", trace = 1)## Start: AIC=31410.04
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health + marital
##
## Df Sum of Sq RSS AIC
## - marital 2 180 403789 31410
## <none> 403609 31410
## - sex 1 104 403712 31410
## - bmi 1 108 403716 31410
## - smoking 2 347 403956 31413
## - depression 1 656 404265 31421
## - education 3 876 404485 31421
## - income 3 1550 405158 31435
## - age_group 3 2235 405844 31448
## - exercise 1 4537 408146 31497
## - menthlth_days 1 13230 416839 31666
## - gen_health 2 84670 488279 32930
##
## Step: AIC=31409.61
## physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## - bmi 1 98 403887 31410
## <none> 403789 31410
## - sex 1 102 403891 31410
## - smoking 2 345 404134 31412
## - depression 1 637 404427 31420
## - education 3 875 404665 31421
## - income 3 1388 405177 31431
## - age_group 3 3048 406837 31464
## - exercise 1 4538 408327 31497
## - menthlth_days 1 13195 416984 31665
## - gen_health 2 84707 488496 32929
##
## Step: AIC=31409.55
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
# Forward selection: Start from an intercept-only model with the maximum model as the upper scope (direction = "forward")
# Automated forward selection using AIC
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = model_mlr),
direction = "forward", trace = 1)## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940.1 416725 31632
## + exercise 1 6466.7 428198 31849
## + depression 1 6070.1 428595 31857
## + income 3 3231.0 431434 31913
## + smoking 2 1776.3 432889 31938
## + age_group 3 1535.7 433129 31945
## + sex 1 1254.2 433411 31946
## + marital 2 830.3 433835 31956
## + education 3 415.5 434250 31965
## <none> 434665 31967
## + bmi 1 0.2 434665 31969
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5820.8 410904 31521
## + age_group 3 4661.5 412064 31548
## + income 3 1987.0 414738 31600
## + marital 2 1402.5 415322 31609
## + smoking 2 1034.6 415690 31616
## + depression 1 738.1 415987 31620
## + sex 1 605.2 416120 31622
## + education 3 318.9 416406 31632
## <none> 416725 31632
## + bmi 1 3.6 416721 31634
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720.5 407184 31455
## + income 3 1198.4 409706 31504
## + marital 2 1064.8 409839 31505
## + depression 1 739.1 410165 31509
## + smoking 2 706.2 410198 31512
## + education 3 686.9 410217 31514
## + sex 1 449.3 410455 31515
## <none> 410904 31521
## + bmi 1 82.9 410821 31522
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1162.69 406021 31438
## + depression 1 932.70 406251 31438
## + education 3 501.69 406682 31451
## + sex 1 261.03 406923 31452
## + smoking 2 360.51 406823 31452
## <none> 407184 31455
## + bmi 1 83.59 407100 31455
## + marital 2 40.11 407144 31458
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 861.57 405159 31423
## + education 3 953.62 405067 31425
## + smoking 2 305.02 405716 31436
## + sex 1 203.01 405818 31436
## <none> 406021 31438
## + bmi 1 74.91 405946 31438
## + marital 2 153.70 405867 31439
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837.21 404322 31412
## + smoking 2 259.11 404900 31422
## + sex 1 110.16 405049 31422
## + bmi 1 105.12 405054 31423
## <none> 405159 31423
## + marital 2 169.72 404990 31423
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332.18 403990 31410
## + bmi 1 110.19 404212 31412
## <none> 404322 31412
## + sex 1 77.87 404244 31413
## + marital 2 168.26 404154 31413
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 102.92 403887 31410
## <none> 403990 31410
## + bmi 1 98.80 403891 31410
## + marital 2 169.54 403820 31410
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## + bmi 1 97.861 403789 31410
## + marital 2 170.601 403716 31410
# Stepwise selection: Start from the intercept-only model with both directions allowed (direction = "both")
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = model_mlr),
direction = "both", trace = 1)## Start: AIC=34524.38
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 164022 434665 31967
## + menthlth_days 1 60303 538384 33677
## + exercise 1 34542 564145 34051
## + income 3 28842 569845 34135
## + depression 1 20750 577937 34244
## + smoking 2 12790 585897 34356
## + education 3 8593 590094 34415
## + marital 2 7321 591366 34430
## + bmi 1 6253 592434 34442
## + age_group 3 6527 592160 34443
## + sex 1 1649 597038 34504
## <none> 598687 34524
##
## Step: AIC=31967.07
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 17940 416725 31632
## + exercise 1 6467 428198 31849
## + depression 1 6070 428595 31857
## + income 3 3231 431434 31913
## + smoking 2 1776 432889 31938
## + age_group 3 1536 433129 31945
## + sex 1 1254 433411 31946
## + marital 2 830 433835 31956
## + education 3 416 434250 31965
## <none> 434665 31967
## + bmi 1 0 434665 31969
## - gen_health 2 164022 598687 34524
##
## Step: AIC=31631.88
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 5821 410904 31521
## + age_group 3 4661 412064 31548
## + income 3 1987 414738 31600
## + marital 2 1402 415322 31609
## + smoking 2 1035 415690 31616
## + depression 1 738 415987 31620
## + sex 1 605 416120 31622
## + education 3 319 416406 31632
## <none> 416725 31632
## + bmi 1 4 416721 31634
## - menthlth_days 1 17940 434665 31967
## - gen_health 2 121660 538384 33677
##
## Step: AIC=31521.35
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3720 407184 31455
## + income 3 1198 409706 31504
## + marital 2 1065 409839 31505
## + depression 1 739 410165 31509
## + smoking 2 706 410198 31512
## + education 3 687 410217 31514
## + sex 1 449 410455 31515
## <none> 410904 31521
## + bmi 1 83 410821 31522
## - exercise 1 5821 416725 31632
## - menthlth_days 1 17294 428198 31849
## - gen_health 2 101805 512709 33288
##
## Step: AIC=31454.58
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + income 3 1163 406021 31438
## + depression 1 933 406251 31438
## + education 3 502 406682 31451
## + sex 1 261 406923 31451
## + smoking 2 361 406823 31451
## <none> 407184 31455
## + bmi 1 84 407100 31455
## + marital 2 40 407144 31458
## - age_group 3 3720 410904 31521
## - exercise 1 4880 412064 31548
## - menthlth_days 1 19957 427141 31835
## - gen_health 2 94875 502059 33126
##
## Step: AIC=31437.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income
##
## Df Sum of Sq RSS AIC
## + depression 1 862 405159 31423
## + education 3 954 405067 31425
## + smoking 2 305 405716 31436
## + sex 1 203 405818 31436
## <none> 406021 31438
## + bmi 1 75 405946 31438
## + marital 2 154 405867 31439
## - income 3 1163 407184 31455
## - age_group 3 3685 409706 31504
## - exercise 1 4214 410235 31518
## - menthlth_days 1 18945 424966 31801
## - gen_health 2 87509 493530 32995
##
## Step: AIC=31422.71
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression
##
## Df Sum of Sq RSS AIC
## + education 3 837 404322 31412
## + smoking 2 259 404900 31422
## + sex 1 110 405049 31423
## + bmi 1 105 405054 31423
## <none> 405159 31423
## + marital 2 170 404990 31423
## - depression 1 862 406021 31438
## - income 3 1092 406251 31438
## - age_group 3 3871 409030 31493
## - exercise 1 4219 409378 31504
## - menthlth_days 1 13674 418834 31686
## - gen_health 2 86610 491770 32969
##
## Step: AIC=31412.16
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education
##
## Df Sum of Sq RSS AIC
## + smoking 2 332 403990 31410
## + bmi 1 110 404212 31412
## <none> 404322 31412
## + sex 1 78 404244 31413
## + marital 2 168 404154 31413
## - education 3 837 405159 31423
## - depression 1 745 405067 31425
## - income 3 1495 405818 31436
## - age_group 3 3558 407880 31476
## - exercise 1 4604 408926 31501
## - menthlth_days 1 13607 417929 31675
## - gen_health 2 86813 491135 32964
##
## Step: AIC=31409.59
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking
##
## Df Sum of Sq RSS AIC
## + sex 1 103 403887 31410
## <none> 403990 31410
## + bmi 1 99 403891 31410
## + marital 2 170 403820 31410
## - smoking 2 332 404322 31412
## - depression 1 691 404681 31421
## - education 3 910 404900 31422
## - income 3 1457 405447 31432
## - age_group 3 3178 407168 31466
## - exercise 1 4502 408492 31496
## - menthlth_days 1 13347 417337 31668
## - gen_health 2 85552 489542 32942
##
## Step: AIC=31409.55
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
##
## Df Sum of Sq RSS AIC
## <none> 403887 31410
## - sex 1 103 403990 31410
## + bmi 1 98 403789 31410
## + marital 2 171 403716 31410
## - smoking 2 357 404244 31413
## - depression 1 612 404499 31420
## - education 3 875 404762 31421
## - income 3 1394 405281 31431
## - age_group 3 3049 406936 31464
## - exercise 1 4451 408338 31495
## - menthlth_days 1 13238 417125 31666
## - gen_health 2 85645 489532 32944
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
Interpretation: The backward, forward, and stepwise methods all agree on the same final model. This model includes nine predictors: mental health days, sex, exercise, depression, age group, income, education, smoking status, and general health. Variables such as BMI and marital status were consistently excluded across all methods, indicating they did not provide enough explanatory power to justify their inclusion based on the AIC criterion.
# Compare all selection methods
method_comparison <- tribble(
~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
"Maximum model",
length(coef(model_mlr)) - 1,
round(glance(model_mlr)$adj.r.squared, 4),
round(AIC(model_mlr), 1),
round(BIC(model_mlr), 1),
"Backward (AIC)",
length(coef(mod_backward)) - 1,
round(glance(mod_backward)$adj.r.squared, 4),
round(AIC(mod_backward), 1),
round(BIC(mod_backward), 1),
"Forward (AIC)",
length(coef(mod_forward)) - 1,
round(glance(mod_forward)$adj.r.squared, 4),
round(AIC(mod_forward), 1),
round(BIC(mod_forward), 1),
"Stepwise (AIC)",
length(coef(mod_stepwise)) - 1,
round(glance(mod_stepwise)$adj.r.squared, 4),
round(AIC(mod_stepwise), 1),
round(BIC(mod_stepwise), 1)
)
method_comparison |>
kable(caption = "Comparison of Variable Selection Methods") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Method | Variables selected | Adj. R² | AIC | BIC |
|---|---|---|---|---|
| Maximum model | 20 | 0.3242 | 54115.1 | 54268.8 |
| Backward (AIC) | 17 | 0.3239 | 54114.6 | 54247.3 |
| Forward (AIC) | 17 | 0.3239 | 54114.6 | 54247.3 |
| Stepwise (AIC) | 17 | 0.3239 | 54114.6 | 54247.3 |
# Coefficient table for final model using tidy() with confidence intervals
final_model <- mod_stepwise
tidy(final_model, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Final Model Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.3853 | 0.4874 | 2.8425 | 0.0045 | 0.4300 | 2.3407 |
| gen_healthGood | 1.3636 | 0.1839 | 7.4161 | 0.0000 | 1.0032 | 1.7241 |
| gen_healthFair/Poor | 10.0009 | 0.2486 | 40.2245 | 0.0000 | 9.5136 | 10.4883 |
| menthlth_days | 0.1836 | 0.0114 | 16.1748 | 0.0000 | 0.1614 | 0.2059 |
| exerciseYes | -1.9048 | 0.2031 | -9.3793 | 0.0000 | -2.3029 | -1.5067 |
| age_group35-49 | 0.8374 | 0.2703 | 3.0980 | 0.0020 | 0.3075 | 1.3672 |
| age_group50-64 | 1.4778 | 0.2583 | 5.7206 | 0.0000 | 0.9714 | 1.9841 |
| age_group65+ | 1.8366 | 0.2513 | 7.3099 | 0.0000 | 1.3441 | 2.3292 |
| income$25K-$49K | -1.0324 | 0.2749 | -3.7557 | 0.0002 | -1.5713 | -0.4936 |
| income$50K-$99K | -1.3884 | 0.2658 | -5.2239 | 0.0000 | -1.9094 | -0.8674 |
| income$100K+ | -1.1122 | 0.3849 | -2.8897 | 0.0039 | -1.8666 | -0.3577 |
| depressionYes | 0.7471 | 0.2149 | 3.4769 | 0.0005 | 0.3259 | 1.1683 |
| educationHS/GED | 0.2576 | 0.4006 | 0.6431 | 0.5202 | -0.5276 | 1.0428 |
| educationSome college | 0.9636 | 0.4025 | 2.3943 | 0.0167 | 0.1747 | 1.7525 |
| educationCollege graduate | 1.0310 | 0.4043 | 2.5502 | 0.0108 | 0.2385 | 1.8236 |
| smokingCurrent | 0.4776 | 0.2647 | 1.8038 | 0.0713 | -0.0414 | 0.9965 |
| smokingFormer | 0.4399 | 0.1876 | 2.3442 | 0.0191 | 0.0720 | 0.8077 |
| sexFemale | 0.2329 | 0.1633 | 1.4262 | 0.1539 | -0.0872 | 0.5530 |
Choose final model: All three automated methods selected the same model with 17 predictor terms (Adjusted R² = 0.3239, AIC = 54114.6, BIC = 54247.3). This model has a lower AIC and BIC than the maximum model (AIC = 54115.1, BIC = 54268.8), affirming that excluding BMI and marital status would improve model parsimony without sacrificing fit. The modest improvement in BIC (21.5 points lower) is more notable than the AIC improvement (0.5 points lower), consistent with BIC’s stronger preference for simpler models. All three methods optimize the same criterion (AIC) on the same data. Thus, the maximum model and the model identified by the Backward, Forward, and Stepwise selection procedure produce very similar predictions, but the identified model is preferred due to its efficiency.
Interpretation: Mentally Unhealthy Days Holding all other variables constant, each additional mentally unhealthy day was associated with an average increase of 0.1836 physically unhealthy days.
Interpretation: Exercise Holding all other variables constant, individuals who do exercise have approximately 1.9048 fewer physically unhealthy days on average compared to individuals who do not exercise (the reference group).
Interpretation: income 25k- 49k Holding all other variables constant, individuals reporting incomes 25K-49K had approximately 1.0324 fewer physically unhealthy days on average compared to individuals reporting incomes less than $25K (the reference group).
# Using final linear model, produce the four base R diagnostic plots
par(mfrow = c(2, 2))
plot(final_model)Residuals vs Fitted: The red line is relatively flat, suggesting the Linearity assumption is mostly met. Q-Q Residuals: The points curve away from the line at the top right, suggesting a heavy tail. The residuals do not follow a normal distribution, deviating at the top right. Scale-Location: The red line is fairly horizontal, suggesting Homoscedasticity, constant variance, is generally acceptable. The spread of residuals are roughly constant across fitted values. Residuals vs Leverage: There are no highly influential observations that are pulling the model too far in any one direction.
Conclusion: The LINE assumptions are reasonably satisfied, since the Residuals vs. Fitted and Scale-Location plots show a relatively flat trend and consistent variance. The primary violation is seen in the Q-Q plot, where the residuals deviate from the diagonal line at the upper tail, indicating non-normality. But, given the large sample size (n=8,000) and the discrete nature of the physically unhealthy days, the model remains reliable for interpretation.
# Create the Binary Outcome
brfss_logistic <- brfss_analytic %>%
mutate(phys_distress = ifelse(physhlth_days >= 14, 1, 0)) %>%
mutate(phys_distress = factor(phys_distress,
levels = c(0, 1),
labels = c("No", "Yes")))
# Prevalence of frequent physical distress
Frequency_physdis = cbind(freq = table(brfss_logistic$phys_distress), percent = prop.table(table(brfss_logistic$phys_distress)))
Frequency_physdis |>
kable(caption = "Frequency Table: Frequent Physical Distress") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| freq | percent | |
|---|---|---|
| No | 6948 | 0.8685 |
| Yes | 1052 | 0.1315 |
Interpretation: The outcome is unbalanced, with only about 13.2% of the sample experiencing frequent physical distress compared to 86.9% who do not.
One Predictor: One predictor from the final linear model that I expect to have a strong association with physical distress is mentally unhealthy days. In the linear model, mental health days had one of the strongest statistical associations with physical health, proven by its large t-statistic: and the significant jump in RSS when removed. Also chronic mental distress is a well known driver of physical manifestations and perceived physical health decline. Thus, mentally unhealthy days is a reasonable predictor to expect to have a strong association with physical distress.
# Fit a simple logistic regression:
mod_simple <- glm(phys_distress ~ menthlth_days,
data = brfss_logistic, family = binomial)
# Display the full summary() output
summary(mod_simple)##
## Call:
## glm(formula = phys_distress ~ menthlth_days, family = binomial,
## data = brfss_logistic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.346000 0.042986 -54.58 <2e-16 ***
## menthlth_days 0.073236 0.003181 23.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 5734.5 on 7998 degrees of freedom
## AIC: 5738.5
##
## Number of Fisher Scoring iterations: 5
# Calculate and report the odds ratio and 95% confidence interval
exp(coef(mod_simple)) # Odds ratio## (Intercept) menthlth_days
## 0.09575142 1.07598426
## 2.5 % 97.5 %
## (Intercept) 0.08793573 0.1040772
## menthlth_days 1.06929605 1.0827183
# Wald Test
tidy(mod_simple, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(c(estimate, std.error, statistic, conf.low, conf.high),
\(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3)) |>
kable(caption = "Wald Tests for Each Coefficient (Exponentiated)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.096 | 0.043 | -54.576 | <2e-16 | 0.088 | 0.104 |
| menthlth_days | 1.076 | 0.003 | 23.020 | <2e-16 | 1.069 | 1.083 |
Interpretation: For every one-unit increase in mentally unhealthy days, the odds of frequent physical distress are multiplied by about 1.076, 95% CI [1.069, 1.083].
Is the association is statistically significant?: Since the p-value is <2e-16 (p<0.001) the association between mentally unhealthy days and frequent physical distress is statistically significant at alpha = 0.05. The Wald Test above highlights this conclusion.
# Multiple logistic regression
mod_logistic <- glm(phys_distress ~ gen_health + menthlth_days + exercise + age_group +
income + depression + education + smoking + sex,
data = brfss_logistic,
family = binomial)
# Display the full summary() output
summary(mod_logistic)##
## Call:
## glm(formula = phys_distress ~ gen_health + menthlth_days + exercise +
## age_group + income + depression + education + smoking + sex,
## family = binomial, data = brfss_logistic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.869370 0.237416 -16.298 < 2e-16 ***
## gen_healthGood 0.883594 0.115593 7.644 2.11e-14 ***
## gen_healthFair/Poor 2.676571 0.113511 23.580 < 2e-16 ***
## menthlth_days 0.044803 0.004351 10.298 < 2e-16 ***
## exerciseYes -0.577926 0.085063 -6.794 1.09e-11 ***
## age_group35-49 0.522042 0.157941 3.305 0.000949 ***
## age_group50-64 0.775775 0.146727 5.287 1.24e-07 ***
## age_group65+ 0.980003 0.144115 6.800 1.05e-11 ***
## income$25K-$49K -0.190466 0.110542 -1.723 0.084887 .
## income$50K-$99K -0.439184 0.111295 -3.946 7.94e-05 ***
## income$100K+ -0.451308 0.220972 -2.042 0.041115 *
## depressionYes 0.258162 0.095637 2.699 0.006946 **
## educationHS/GED 0.034581 0.164741 0.210 0.833738
## educationSome college 0.315308 0.164722 1.914 0.055597 .
## educationCollege graduate 0.266869 0.171000 1.561 0.118610
## smokingCurrent 0.205214 0.115062 1.784 0.074504 .
## smokingFormer 0.199548 0.089583 2.228 0.025913 *
## sexFemale 0.167499 0.080816 2.073 0.038208 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4423.9 on 7982 degrees of freedom
## AIC: 4459.9
##
## Number of Fisher Scoring iterations: 6
# Table of adjusted odds ratios with 95% confidence intervals
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.021 | 0.237 | -16.298 | 0.000 | 0.013 | 0.033 |
| gen_healthGood | 2.420 | 0.116 | 7.644 | 0.000 | 1.933 | 3.042 |
| gen_healthFair/Poor | 14.535 | 0.114 | 23.580 | 0.000 | 11.670 | 18.215 |
| menthlth_days | 1.046 | 0.004 | 10.298 | 0.000 | 1.037 | 1.055 |
| exerciseYes | 0.561 | 0.085 | -6.794 | 0.000 | 0.475 | 0.663 |
| age_group35-49 | 1.685 | 0.158 | 3.305 | 0.001 | 1.240 | 2.304 |
| age_group50-64 | 2.172 | 0.147 | 5.287 | 0.000 | 1.635 | 2.908 |
| age_group65+ | 2.664 | 0.144 | 6.800 | 0.000 | 2.017 | 3.551 |
| income$25K-$49K | 0.827 | 0.111 | -1.723 | 0.085 | 0.666 | 1.027 |
| income$50K-$99K | 0.645 | 0.111 | -3.946 | 0.000 | 0.519 | 0.802 |
| income$100K+ | 0.637 | 0.221 | -2.042 | 0.041 | 0.408 | 0.971 |
| depressionYes | 1.295 | 0.096 | 2.699 | 0.007 | 1.072 | 1.560 |
| educationHS/GED | 1.035 | 0.165 | 0.210 | 0.834 | 0.751 | 1.434 |
| educationSome college | 1.371 | 0.165 | 1.914 | 0.056 | 0.995 | 1.899 |
| educationCollege graduate | 1.306 | 0.171 | 1.561 | 0.119 | 0.937 | 1.832 |
| smokingCurrent | 1.228 | 0.115 | 1.784 | 0.075 | 0.979 | 1.536 |
| smokingFormer | 1.221 | 0.090 | 2.228 | 0.026 | 1.024 | 1.455 |
| sexFemale | 1.182 | 0.081 | 2.073 | 0.038 | 1.009 | 1.386 |
# Or table of adjusted odds ratios with 95% confidence intervals
mod_logistic |>
tbl_regression(
exponentiate = TRUE,
label = list(
gen_health ~ "General health status",
menthlth_days ~ "Mentally unhealthy days (past 30)",
exercise ~ "Any physical activity (past 30 days)",
age_group ~ "Age group",
income ~ "Annual household income",
depression ~ "Ever told depressive disorder",
education ~ "Highest level of education completed",
smoking ~ "Smoking status",
sex ~ "Sex"
)
) |>
bold_labels() |>
bold_p()| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| General health status | |||
| Excellent/Very Good | — | — | |
| Good | 2.42 | 1.93, 3.04 | <0.001 |
| Fair/Poor | 14.5 | 11.7, 18.2 | <0.001 |
| Mentally unhealthy days (past 30) | 1.05 | 1.04, 1.05 | <0.001 |
| Any physical activity (past 30 days) | |||
| No | — | — | |
| Yes | 0.56 | 0.48, 0.66 | <0.001 |
| Age group | |||
| 18-34 | — | — | |
| 35-49 | 1.69 | 1.24, 2.30 | <0.001 |
| 50-64 | 2.17 | 1.64, 2.91 | <0.001 |
| 65+ | 2.66 | 2.02, 3.55 | <0.001 |
| Annual household income | |||
| Less than $25K | — | — | |
| $25K-$49K | 0.83 | 0.67, 1.03 | 0.085 |
| $50K-$99K | 0.64 | 0.52, 0.80 | <0.001 |
| $100K+ | 0.64 | 0.41, 0.97 | 0.041 |
| Ever told depressive disorder | |||
| No | — | — | |
| Yes | 1.29 | 1.07, 1.56 | 0.007 |
| Highest level of education completed | |||
| Less than HS | — | — | |
| HS/GED | 1.04 | 0.75, 1.43 | 0.8 |
| Some college | 1.37 | 1.00, 1.90 | 0.056 |
| College graduate | 1.31 | 0.94, 1.83 | 0.12 |
| Smoking status | |||
| Never | — | — | |
| Current | 1.23 | 0.98, 1.54 | 0.075 |
| Former | 1.22 | 1.02, 1.45 | 0.026 |
| Sex | |||
| Male | — | — | |
| Female | 1.18 | 1.01, 1.39 | 0.038 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
Interpretation Mentally Unhealthy Days: For every one day increase in mentally unhealthy days, the odds of experiencing frequent physical distress increase by 5% (adjusted OR = 1.05; 95% CI: 1.04, 1.05), holding all other variables constant. Interpretation Exercise: Individuals who participated in physical activity in the past 30 days have 44% lower odds (adjusted OR = 0.56; 95% CI: 0.48, 0.66) of experiencing frequent physical distress compared to those who did not exercise, holding all other variables constant. Interpretation Depression: The odds of frequent physical distress are 1.29 times higher for individuals ever told they have a depressive disorder compared to those never told they have depression (adjusted OR = 1.29; 95% CI: 1.07, 1.56), holding all other variables constant.
# Reduced model that excludes income
mod_reduced <- glm(phys_distress ~ gen_health + menthlth_days + exercise + age_group + depression + education + smoking + sex,
data = brfss_logistic,
family = binomial)
# likelihood ratio test comparing the reduced and full models
anova(mod_reduced, mod_logistic, test = "LRT") |>
kable(digits = 3,
caption = "LR Test: Does adding income improve the model?") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7985 | 4440.376 | NA | NA | NA |
| 7982 | 4423.888 | 3 | 16.488 | 0.001 |
# likelihood ratio test comparing the reduced and full models (as per assignment instrucions, same results)
anova(mod_reduced, mod_logistic, test = "Chisq") |>
kable(digits = 3,
caption = "Chi Square Test: Does adding income improve the model?") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7985 | 4440.376 | NA | NA | NA |
| 7982 | 4423.888 | 3 | 16.488 | 0.001 |
Null hypothesis: H0 = The dropped predictors (income) have no effect on the model Alternative hypothesis: H1 = The dropped predictors have a significant effect Test Statistic (Deviance): 16.488 Degrees of freedom: 3 p-value: 0.001 Conclusion: Since p-value is less than 0.05 (p = 0.001), we reject the null hypothesis, indicating the income variable significantly improves the model fit.
# Generate a ROC curve for your multiple logistic regression model
roc_obj <- roc(brfss_logistic$phys_distress, fitted(mod_logistic))
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)## Area under the curve: 0.8555
## 95% CI: 0.8427-0.8683 (DeLong)
AUC: 0.8555 95% CI: [0.8427, 0.8683] Interpretation: The AUC, 0.8555, indicates the model has excellent ability to discriminate between individuals with and without frequent physical distress.
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509
Null hypothesis: H0 = The model fits the data well Alternative hypothesis: H1 = The model does not fit the data well Test Statistic (X-squared): 6.8688 Degrees of freedom: 8 p-value: 0.5509 Conclusion: At alpha = 0.05, we fail to reject the null hypothesis because the p-value (0.5509) is greater than 0.05. There is no evidence of poor model fit, suggesting that the predicted probabilities align well with the observed frequencies of physical distress across the different risk groups. Interpretation: While the ROC/AUC (0.8555) demonstrates that the model has excellent discrimination, the Hosmer-Lemeshow test confirms the model’s calibration. Together, they indicate that the model is not only good at ranking individuals by risk but is also accurate in its absolute probability estimates.
The results of this analysis suggest that physical health burden among U.S. adults is a complex issue driven by a combination of mental health, lifestyle, and socioeconomic factors. In both the linear and logistic models, mentally unhealthy days and general health status emerged as the strongest predictors. Variables like BMI and marital status were significant in the initial maximum linear model but were excluded during automated selection, suggesting their impact is less critical when accounting for factors like depression and income.
A key limitation of this study is the cross-sectional nature of the BRFSS data, which only provides a snapshot in time. Thus we cannot determine temporality. We cannot determine if poor mental health causes physical distress or if chronic physical pain leads to mental health decline. Additionally, the model may potentially be missing key confounders like nutritional habits and environmental stressors, which are known to impact physical health but were not included in this dataset.
For this assignment, I sought AI assistance for troubleshooting package conflicts, specifically the dplyr::select error. When selcting the variables from the dataset I kept getting errors. So I asked Gemini to help identify the problem. I asked “How do I fix the”unused arguments” error in select?” I verified the correctness when the code was able to run sucessfully.
End of HW Assignment 4