Total observations: 433,323 Total rows : 350
brfss_subset <- brfss_raw %>%
select(MENTHLTH, PHYSHLTH, `_BMI5`, SEXVAR, EXERANY2, ADDEPEV3, `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`, GENHLTH, MARITAL)
# Recode variables
brfss_recode <- brfss_subset %>%
mutate(
#menthlth_days
MENTHLTH = if_else(MENTHLTH %in% c(77, 99), NA_real_, MENTHLTH),
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_),
#physhlth_days
PHYSHLTH = if_else(PHYSHLTH %in% c(77, 99), NA_real_, PHYSHLTH),
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_),
#bmi
bmi = `_BMI5` / 100,
bmi = if_else(bmi %in% c(9999), NA_real_, bmi),
#sex
sex = factor(ifelse(SEXVAR == 1, "Male", "Female")),
#exercise
EXERANY2 = if_else(EXERANY2 %in% c(7, 9), NA_real_, EXERANY2),
exercise = factor(ifelse(EXERANY2 == 1, "Yes", "No")),
#depression
ADDEPEV3 = if_else(ADDEPEV3 %in% c(7, 9), NA_real_, ADDEPEV3),
depression = factor(ifelse(ADDEPEV3 == 1, "Yes", "No")),
#age_group
`_AGEG5YR` = if_else(`_AGEG5YR` %in% c(14), NA_real_, `_AGEG5YR`),
age_group = factor(case_when(
`_AGEG5YR` == 1 | `_AGEG5YR` == 2 | `_AGEG5YR` == 3 ~ "18-34",
`_AGEG5YR` == 4 | `_AGEG5YR` == 5 | `_AGEG5YR` == 6 ~ "35-49",
`_AGEG5YR` == 7 | `_AGEG5YR` == 8 | `_AGEG5YR` == 9 ~ "50-64",
`_AGEG5YR` == 10| `_AGEG5YR` == 11| `_AGEG5YR` == 12 | `_AGEG5YR` == 13 ~ "65+"),
levels = c("18-34", "35-49", "50-64", "65+")),
#income
`_INCOMG1` = if_else(`_INCOMG1` %in% c(9), NA_real_, `_INCOMG1`),
income = factor(case_when(
`_INCOMG1` == 1 | `_INCOMG1` == 2 ~ "< $25,000",
`_INCOMG1` == 3 | `_INCOMG1` == 4 ~ "$25,000 to < $50,000",
`_INCOMG1` == 5 | `_INCOMG1` == 6 ~ "$50,000 to < $100,000",
`_INCOMG1` == 7 ~ "$100,000 or more"),
levels = c("< $25,000", "$25,000 to < $50,000", "$50,000 to < $100,000",
"$100,000 or more")),
#education
EDUCA = if_else(EDUCA %in% c(9), NA_real_, EDUCA),
education = factor(case_when(
EDUCA == 1 | EDUCA == 2 | EDUCA == 3 ~ "< High school",
EDUCA == 4 ~ "High school graduate or GED",
EDUCA == 5 ~ "Some college or technical school",
EDUCA == 6 ~ "College graduate"),
levels = c("< High school", "High school graduate or GED",
"Some college or technical school", "College graduate")),
#smoking
`_SMOKER3` = if_else(`_SMOKER3` %in% c(9), NA_real_, `_SMOKER3`),
smoking = factor(case_when(
`_SMOKER3` == 1 | `_SMOKER3` == 2 ~ "Current",
`_SMOKER3` == 3 ~ "Former",
`_SMOKER3` == 4 ~ "Never"),
levels = c("Current", "Former", "Never")),
#gen_health
GENHLTH = if_else(GENHLTH %in% c(7,9), NA_real_, GENHLTH),
gen_health = factor(case_when(
GENHLTH == 1 | GENHLTH == 2 ~ "Excellent/Very Good",
GENHLTH == 3 ~ "Good",
GENHLTH == 4 | GENHLTH == 5 ~ "Fair/Poor"),
levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
#marital
MARITAL = if_else(MARITAL %in% c(9), NA_real_, MARITAL),
marital = factor(case_when(
MARITAL == 1 | MARITAL == 6 ~ "Married/Partnered",
MARITAL == 2 | MARITAL == 3 | MARITAL == 4 ~ "Previously married",
MARITAL == 5 ~ "Never Married"),
levels = c("Married/Partnered", "Previously married", "Never Married")))
brfss_clean <- brfss_recode %>%
select(menthlth_days, physhlth_days, gen_health, bmi, sex, exercise, depression, income, education)
The 9 variables selected for this assignment were: mental health days, physical health days, general health days, bmi, sex, exercise, depression, income, and education. Mental and physical health days, as well as health days, are continuous. BMI, exercise, income, and education are categorical. Depression and sex are binary. I selected these variables because they are relevant social determinants of health in public health research.
brfss_clean$sex <- relevel(brfss_clean$sex, ref = "Male")
brfss_clean$exercise <- relevel(brfss_clean$exercise, ref = "No")
brfss_clean$depression <- relevel(brfss_clean$depression, ref = "No")
brfss_clean$income <- relevel(brfss_clean$income, ref = "< $25,000")
brfss_clean$education <- relevel(brfss_clean$education, ref = "< High school")
Report the number and percentage of missing observations for menthlth_days, physhlth_days, income, and education
mean(is.na(brfss_clean$menthlth_days)) * 100 # 1.87%
## [1] 1.871122
mean(is.na(brfss_clean$physhlth_days)) * 100 # 2.49%
## [1] 2.488906
mean(is.na(brfss_clean$income)) * 100 # 19.99%
## [1] 19.9904
mean(is.na(brfss_clean$education)) * 100 # 0.54%
## [1] 0.5365513
brfss_analytic <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
n=433,323 with 9 variables before deletion n=317,142 with 9 variables after listwise deletion n-8,000 with 9 variables after random seed of 8,000
brfss_analytic %>%
tbl_summary(
include = c(menthlth_days, physhlth_days, gen_health, sex, bmi, exercise, depression, income, education),
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
gen_health ~ "General health status",
sex ~ "Sex",
bmi ~ "Body mass index",
income ~ "Annual household income",
depression ~ "Ever told depressive disorder",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Highest level of education completed"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) %>%
add_n() %>%
# add_overall() %>%
bold_labels() %>%
#italicize_levels() %>%
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**") %>%
as_flex_table()
Characteristic | N | N = 8,0001 |
|---|---|---|
Mentally unhealthy days (past 30) | 8,000 | 4.4 (8.4) |
Physically unhealthy days (past 30) | 8,000 | 4.4 (8.8) |
General health status | 8,000 | |
Excellent/Very Good | 3,929 (49%) | |
Good | 2,610 (33%) | |
Fair/Poor | 1,461 (18%) | |
Sex | 8,000 | |
Male | 3,935 (49%) | |
Female | 4,065 (51%) | |
Body mass index | 8,000 | 28.6 (6.6) |
Any physical activity (past 30 days) | 8,000 | 6,193 (77%) |
Ever told depressive disorder | 8,000 | 1,699 (21%) |
Annual household income | 8,000 | |
< $25,000 | 1,077 (13%) | |
$25,000 to < $50,000 | 1,980 (25%) | |
$50,000 to < $100,000 | 4,336 (54%) | |
$100,000 or more | 607 (7.6%) | |
Highest level of education completed | 8,000 | |
< High school | 409 (5.1%) | |
High school graduate or GED | 1,870 (23%) | |
Some college or technical school | 2,146 (27%) | |
College graduate | 3,575 (45%) | |
1Mean (SD); n (%) | ||
max_model <- lm(physhlth_days ~ menthlth_days + gen_health + bmi + sex + exercise + depression + income + education,
data = brfss_analytic)
tidy(max_model, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 3.3877 | 0.5527 | 6.1297 | 0.0000 | 2.3043 | 4.4711 |
| menthlth_days | 0.1900 | 0.0108 | 17.6715 | 0.0000 | 0.1689 | 0.2111 |
| gen_healthGood | 1.2213 | 0.1853 | 6.5893 | 0.0000 | 0.8579 | 1.5846 |
| gen_healthFair/Poor | 10.9653 | 0.2467 | 44.4435 | 0.0000 | 10.4816 | 11.4489 |
| bmi | -0.0362 | 0.0125 | -2.8990 | 0.0038 | -0.0607 | -0.0117 |
| sexFemale | 0.0085 | 0.1619 | 0.0528 | 0.9579 | -0.3087 | 0.3258 |
| exerciseYes | -2.3096 | 0.2029 | -11.3829 | 0.0000 | -2.7074 | -1.9119 |
| depressionYes | 0.4683 | 0.2170 | 2.1576 | 0.0310 | 0.0428 | 0.8937 |
| income$25,000 to < $50,000 | -0.7976 | 0.2765 | -2.8843 | 0.0039 | -1.3397 | -0.2555 |
| income$50,000 to < $100,000 | -0.9094 | 0.2689 | -3.3814 | 0.0007 | -1.4366 | -0.3822 |
| income$100,000 or more | -0.8343 | 0.3937 | -2.1191 | 0.0341 | -1.6061 | -0.0625 |
| educationHigh school graduate or GED | 0.9845 | 0.3955 | 2.4890 | 0.0128 | 0.2091 | 1.7598 |
| educationSome college or technical school | 1.5535 | 0.3986 | 3.8976 | 0.0001 | 0.7722 | 2.3349 |
| educationCollege graduate | 1.4388 | 0.3993 | 3.6029 | 0.0003 | 0.6560 | 2.2216 |
summary(max_model)
##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + gen_health + bmi +
## sex + exercise + depression + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.1900 -2.6193 -0.8945 0.3422 30.0757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.38773 0.55268 6.130 9.22e-10
## menthlth_days 0.19000 0.01075 17.672 < 2e-16
## gen_healthGood 1.22125 0.18534 6.589 4.70e-11
## gen_healthFair/Poor 10.96528 0.24672 44.443 < 2e-16
## bmi -0.03619 0.01248 -2.899 0.003753
## sexFemale 0.00854 0.16186 0.053 0.957921
## exerciseYes -2.30962 0.20290 -11.383 < 2e-16
## depressionYes 0.46827 0.21704 2.158 0.030992
## income$25,000 to < $50,000 -0.79765 0.27654 -2.884 0.003933
## income$50,000 to < $100,000 -0.90938 0.26894 -3.381 0.000725
## income$100,000 or more -0.83433 0.39373 -2.119 0.034116
## educationHigh school graduate or GED 0.98446 0.39552 2.489 0.012829
## educationSome college or technical school 1.55354 0.39859 3.898 9.80e-05
## educationCollege graduate 1.43878 0.39933 3.603 0.000317
##
## (Intercept) ***
## menthlth_days ***
## gen_healthGood ***
## gen_healthFair/Poor ***
## bmi **
## sexFemale
## exerciseYes ***
## depressionYes *
## income$25,000 to < $50,000 **
## income$50,000 to < $100,000 ***
## income$100,000 or more *
## educationHigh school graduate or GED *
## educationSome college or technical school ***
## educationCollege graduate ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.09 on 7986 degrees of freedom
## Multiple R-squared: 0.3493, Adjusted R-squared: 0.3482
## F-statistic: 329.8 on 13 and 7986 DF, p-value: < 2.2e-16
AIC(max_model)
## [1] 54057.19
BIC(max_model)
## [1] 54162
R-squared: 0.325. This represents a weak linear association between physical health days and our model predictors. This value describes how well the multiple linear regression model fits.
Adjusted R-Squared: 0.324. This represents a weak linear association between physical health days and our model predictors. This value describes how well the multiple linear regression model fits, adjusting for the number of covariates.
AIC: 53,866.58. A lower AIC represents a better fitting model.
BIC: 53,761.77. A lower BIC represents a better fitting model, penalizing for model complexity.
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + gen_health + bmi + sex + exercise + depression + income + education,
data = brfss_analytic,
nvmax = 9, # maximum number of variables to consider
method = "exhaustive"
)
best_summary <- summary(best_subsets)
subset_metrics <- tibble(
p = 1:length(best_summary$adjr2),
`Adj. R²` = best_summary$adjr2,
BIC = best_summary$bic,
Cp = best_summary$cp
)
p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
theme_minimal(base_size = 12)
p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_summary$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
gridExtra::grid.arrange(p1, p2, ncol = 2)
Adjusted R-squared is maximized around 5 variables. In addition, BIC is
minimized around 5 variables. Both criteria agree on the ideal model
size, which is around 5 variables.
pvals <- tidy(max_model) |>
filter(term != "(Intercept)") |>
arrange(desc(p.value)) |>
dplyr::select(term, estimate, p.value) |>
mutate(across(where(is.numeric), \(x) round(x, 4)))
pvals |>
head(5) |>
kable(caption = "Maximum Model: Variables Sorted by p-value (Highest First)") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| term | estimate | p.value |
|---|---|---|
| sexFemale | 0.0085 | 0.9579 |
| income$100,000 or more | -0.8343 | 0.0341 |
| depressionYes | 0.4683 | 0.0310 |
| educationHigh school graduate or GED | 0.9845 | 0.0128 |
| income$25,000 to < $50,000 | -0.7976 | 0.0039 |
mod_backward <- step(max_model, direction = "backward", trace = 1)
## Start: AIC=31352.17
## physhlth_days ~ menthlth_days + gen_health + bmi + sex + exercise +
## depression + income + education
##
## Df Sum of Sq RSS AIC
## - sex 1 0 401402 31350
## <none> 401402 31352
## - depression 1 234 401636 31355
## - income 3 594 401996 31358
## - bmi 1 422 401824 31359
## - education 3 930 402332 31365
## - exercise 1 6513 407914 31479
## - menthlth_days 1 15696 417098 31657
## - gen_health 2 106438 507840 33230
##
## Step: AIC=31350.17
## physhlth_days ~ menthlth_days + gen_health + bmi + exercise +
## depression + income + education
##
## Df Sum of Sq RSS AIC
## <none> 401402 31350
## - depression 1 238 401640 31353
## - income 3 600 402002 31356
## - bmi 1 423 401825 31357
## - education 3 937 402339 31363
## - exercise 1 6530 407932 31477
## - menthlth_days 1 15726 417128 31656
## - gen_health 2 106769 508171 33233
tidy(mod_backward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 3.3925 | 0.5450 | 6.2243 | 0.0000 | 2.3241 | 4.4610 |
| menthlth_days | 0.1900 | 0.0107 | 17.6893 | 0.0000 | 0.1690 | 0.2111 |
| gen_healthGood | 1.2210 | 0.1853 | 6.5904 | 0.0000 | 0.8578 | 1.5842 |
| gen_healthFair/Poor | 10.9645 | 0.2463 | 44.5214 | 0.0000 | 10.4817 | 11.4473 |
| bmi | -0.0362 | 0.0125 | -2.9006 | 0.0037 | -0.0607 | -0.0117 |
| exerciseYes | -2.3101 | 0.2027 | -11.3990 | 0.0000 | -2.7074 | -1.9129 |
| depressionYes | 0.4696 | 0.2156 | 2.1776 | 0.0295 | 0.0469 | 0.8923 |
| income$25,000 to < $50,000 | -0.7982 | 0.2763 | -2.8887 | 0.0039 | -1.3399 | -0.2566 |
| income$50,000 to < $100,000 | -0.9107 | 0.2678 | -3.4006 | 0.0007 | -1.4356 | -0.3857 |
| income$100,000 or more | -0.8363 | 0.3919 | -2.1338 | 0.0329 | -1.6046 | -0.0680 |
| educationHigh school graduate or GED | 0.9851 | 0.3953 | 2.4922 | 0.0127 | 0.2103 | 1.7600 |
| educationSome college or technical school | 1.5547 | 0.3980 | 3.9063 | 0.0001 | 0.7745 | 2.3348 |
| educationCollege graduate | 1.4402 | 0.3984 | 3.6150 | 0.0003 | 0.6592 | 2.2212 |
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = max_model),
direction = "forward", trace = 1)
## Start: AIC=34763.88
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 185566 431316 31905
## + menthlth_days 1 71079 545803 33787
## + exercise 1 38727 578154 34247
## + income 3 25956 590926 34426
## + depression 1 24327 592555 34444
## + education 3 8319 608563 34661
## + bmi 1 5788 611093 34690
## + sex 1 606 616275 34758
## <none> 616882 34764
##
## Step: AIC=31905.19
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 21705.0 409611 31494
## + exercise 1 7318.6 423997 31770
## + depression 1 4878.8 426437 31816
## + income 3 1525.8 429790 31883
## + sex 1 573.7 430742 31897
## + education 3 447.7 430868 31903
## <none> 431316 31905
## + bmi 1 46.0 431270 31906
##
## Step: AIC=31494.13
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 6252.9 403358 31373
## + income 3 773.4 408837 31485
## + depression 1 216.1 409395 31492
## + bmi 1 146.8 409464 31493
## <none> 409611 31494
## + sex 1 93.2 409518 31494
## + education 3 282.1 409329 31495
##
## Step: AIC=31373.06
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + education 3 666.11 402692 31366
## + bmi 1 411.25 402947 31367
## + depression 1 263.48 403094 31370
## + income 3 336.32 403022 31372
## <none> 403358 31373
## + sex 1 33.79 403324 31374
##
## Step: AIC=31365.84
## physhlth_days ~ gen_health + menthlth_days + exercise + education
##
## Df Sum of Sq RSS AIC
## + income 3 667.77 402024 31359
## + bmi 1 423.97 402268 31359
## + depression 1 223.47 402468 31363
## <none> 402692 31366
## + sex 1 20.76 402671 31367
##
## Step: AIC=31358.56
## physhlth_days ~ gen_health + menthlth_days + exercise + education +
## income
##
## Df Sum of Sq RSS AIC
## + bmi 1 383.70 401640 31353
## + depression 1 199.16 401825 31357
## <none> 402024 31359
## + sex 1 5.40 402019 31360
##
## Step: AIC=31352.92
## physhlth_days ~ gen_health + menthlth_days + exercise + education +
## income + bmi
##
## Df Sum of Sq RSS AIC
## + depression 1 238.312 401402 31350
## <none> 401640 31353
## + sex 1 4.476 401636 31355
##
## Step: AIC=31350.17
## physhlth_days ~ gen_health + menthlth_days + exercise + education +
## income + bmi + depression
##
## Df Sum of Sq RSS AIC
## <none> 401402 31350
## + sex 1 0.13994 401402 31352
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 3.3925 | 0.5450 | 6.2243 | 0.0000 | 2.3241 | 4.4610 |
| gen_healthGood | 1.2210 | 0.1853 | 6.5904 | 0.0000 | 0.8578 | 1.5842 |
| gen_healthFair/Poor | 10.9645 | 0.2463 | 44.5214 | 0.0000 | 10.4817 | 11.4473 |
| menthlth_days | 0.1900 | 0.0107 | 17.6893 | 0.0000 | 0.1690 | 0.2111 |
| exerciseYes | -2.3101 | 0.2027 | -11.3990 | 0.0000 | -2.7074 | -1.9129 |
| educationHigh school graduate or GED | 0.9851 | 0.3953 | 2.4922 | 0.0127 | 0.2103 | 1.7600 |
| educationSome college or technical school | 1.5547 | 0.3980 | 3.9063 | 0.0001 | 0.7745 | 2.3348 |
| educationCollege graduate | 1.4402 | 0.3984 | 3.6150 | 0.0003 | 0.6592 | 2.2212 |
| income$25,000 to < $50,000 | -0.7982 | 0.2763 | -2.8887 | 0.0039 | -1.3399 | -0.2566 |
| income$50,000 to < $100,000 | -0.9107 | 0.2678 | -3.4006 | 0.0007 | -1.4356 | -0.3857 |
| income$100,000 or more | -0.8363 | 0.3919 | -2.1338 | 0.0329 | -1.6046 | -0.0680 |
| bmi | -0.0362 | 0.0125 | -2.9006 | 0.0037 | -0.0607 | -0.0117 |
| depressionYes | 0.4696 | 0.2156 | 2.1776 | 0.0295 | 0.0469 | 0.8923 |
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = max_model),
direction = "both", trace = 1)
## Start: AIC=34763.88
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 185566 431316 31905
## + menthlth_days 1 71079 545803 33787
## + exercise 1 38727 578154 34247
## + income 3 25956 590926 34426
## + depression 1 24327 592555 34444
## + education 3 8319 608563 34661
## + bmi 1 5788 611093 34690
## + sex 1 606 616275 34758
## <none> 616882 34764
##
## Step: AIC=31905.19
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 21705 409611 31494
## + exercise 1 7319 423997 31770
## + depression 1 4879 426437 31816
## + income 3 1526 429790 31883
## + sex 1 574 430742 31897
## + education 3 448 430868 31903
## <none> 431316 31905
## + bmi 1 46 431270 31906
## - gen_health 2 185566 616882 34764
##
## Step: AIC=31494.13
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 6253 403358 31373
## + income 3 773 408837 31485
## + depression 1 216 409395 31492
## + bmi 1 147 409464 31493
## <none> 409611 31494
## + sex 1 93 409518 31494
## + education 3 282 409329 31495
## - menthlth_days 1 21705 431316 31905
## - gen_health 2 136192 545803 33787
##
## Step: AIC=31373.06
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + education 3 666 402692 31366
## + bmi 1 411 402947 31367
## + depression 1 263 403094 31370
## + income 3 336 403022 31372
## <none> 403358 31373
## + sex 1 34 403324 31374
## - exercise 1 6253 409611 31494
## - menthlth_days 1 20639 423997 31770
## - gen_health 2 115129 518487 33378
##
## Step: AIC=31365.84
## physhlth_days ~ gen_health + menthlth_days + exercise + education
##
## Df Sum of Sq RSS AIC
## + income 3 668 402024 31359
## + bmi 1 424 402268 31359
## + depression 1 223 402468 31363
## <none> 402692 31366
## + sex 1 21 402671 31367
## - education 3 666 403358 31373
## - exercise 1 6637 409329 31495
## - menthlth_days 1 20555 423247 31762
## - gen_health 2 114677 517368 33367
##
## Step: AIC=31358.56
## physhlth_days ~ gen_health + menthlth_days + exercise + education +
## income
##
## Df Sum of Sq RSS AIC
## + bmi 1 384 401640 31353
## + depression 1 199 401825 31357
## <none> 402024 31359
## + sex 1 5 402019 31360
## - income 3 668 402692 31366
## - education 3 998 403022 31372
## - exercise 1 6237 408261 31480
## - menthlth_days 1 19966 421989 31744
## - gen_health 2 108499 510523 33266
##
## Step: AIC=31352.92
## physhlth_days ~ gen_health + menthlth_days + exercise + education +
## income + bmi
##
## Df Sum of Sq RSS AIC
## + depression 1 238 401402 31350
## <none> 401640 31353
## + sex 1 4 401636 31355
## - bmi 1 384 402024 31359
## - income 3 627 402268 31359
## - education 3 994 402634 31367
## - exercise 1 6484 408125 31479
## - menthlth_days 1 20121 421762 31742
## - gen_health 2 108268 509908 33258
##
## Step: AIC=31350.17
## physhlth_days ~ gen_health + menthlth_days + exercise + education +
## income + bmi + depression
##
## Df Sum of Sq RSS AIC
## <none> 401402 31350
## + sex 1 0 401402 31352
## - depression 1 238 401640 31353
## - income 3 600 402002 31356
## - bmi 1 423 401825 31357
## - education 3 937 402339 31363
## - exercise 1 6530 407932 31477
## - menthlth_days 1 15726 417128 31656
## - gen_health 2 106769 508171 33233
tidy(mod_stepwise, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 3.3925 | 0.5450 | 6.2243 | 0.0000 | 2.3241 | 4.4610 |
| gen_healthGood | 1.2210 | 0.1853 | 6.5904 | 0.0000 | 0.8578 | 1.5842 |
| gen_healthFair/Poor | 10.9645 | 0.2463 | 44.5214 | 0.0000 | 10.4817 | 11.4473 |
| menthlth_days | 0.1900 | 0.0107 | 17.6893 | 0.0000 | 0.1690 | 0.2111 |
| exerciseYes | -2.3101 | 0.2027 | -11.3990 | 0.0000 | -2.7074 | -1.9129 |
| educationHigh school graduate or GED | 0.9851 | 0.3953 | 2.4922 | 0.0127 | 0.2103 | 1.7600 |
| educationSome college or technical school | 1.5547 | 0.3980 | 3.9063 | 0.0001 | 0.7745 | 2.3348 |
| educationCollege graduate | 1.4402 | 0.3984 | 3.6150 | 0.0003 | 0.6592 | 2.2212 |
| income$25,000 to < $50,000 | -0.7982 | 0.2763 | -2.8887 | 0.0039 | -1.3399 | -0.2566 |
| income$50,000 to < $100,000 | -0.9107 | 0.2678 | -3.4006 | 0.0007 | -1.4356 | -0.3857 |
| income$100,000 or more | -0.8363 | 0.3919 | -2.1338 | 0.0329 | -1.6046 | -0.0680 |
| bmi | -0.0362 | 0.0125 | -2.9006 | 0.0037 | -0.0607 | -0.0117 |
| depressionYes | 0.4696 | 0.2156 | 2.1776 | 0.0295 | 0.0469 | 0.8923 |
method_comparison <- tribble(
~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
"Maximum model",
length(coef(max_model)) - 1,
round(glance(max_model)$adj.r.squared, 4),
round(AIC(max_model), 1),
round(BIC(max_model), 1),
"Backward (AIC)",
length(coef(mod_backward)) - 1,
round(glance(mod_backward)$adj.r.squared, 4),
round(AIC(mod_backward), 1),
round(BIC(mod_backward), 1),
"Forward (AIC)",
length(coef(mod_forward)) - 1,
round(glance(mod_forward)$adj.r.squared, 4),
round(AIC(mod_forward), 1),
round(BIC(mod_forward), 1),
"Stepwise (AIC)",
length(coef(mod_stepwise)) - 1,
round(glance(mod_stepwise)$adj.r.squared, 4),
round(AIC(mod_stepwise), 1),
round(BIC(mod_stepwise), 1)
)
method_comparison |>
kable(caption = "Comparison of Variable Selection Methods") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Method | Variables selected | Adj. R² | AIC | BIC |
|---|---|---|---|---|
| Maximum model | 13 | 0.3482 | 54057.2 | 54162 |
| Backward (AIC) | 12 | 0.3483 | 54055.2 | 54153 |
| Forward (AIC) | 12 | 0.3483 | 54055.2 | 54153 |
| Stepwise (AIC) | 12 | 0.3483 | 54055.2 | 54153 |
All three methods of stepwise model selection remove the sex variable. The maximum model includes the sex variable and has a slightly lower adjusted R-squared value and slighlty higher AIC and BIC, as compared to the backward, forward, and stepwise models which were all identical. These identical models included the following variables: mental health days, general health days, bmi, exercise, depression, income, and education.
tidy(max_model, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 3.3877 | 0.5527 | 6.1297 | 0.0000 | 2.3043 | 4.4711 |
| menthlth_days | 0.1900 | 0.0108 | 17.6715 | 0.0000 | 0.1689 | 0.2111 |
| gen_healthGood | 1.2213 | 0.1853 | 6.5893 | 0.0000 | 0.8579 | 1.5846 |
| gen_healthFair/Poor | 10.9653 | 0.2467 | 44.4435 | 0.0000 | 10.4816 | 11.4489 |
| bmi | -0.0362 | 0.0125 | -2.8990 | 0.0038 | -0.0607 | -0.0117 |
| sexFemale | 0.0085 | 0.1619 | 0.0528 | 0.9579 | -0.3087 | 0.3258 |
| exerciseYes | -2.3096 | 0.2029 | -11.3829 | 0.0000 | -2.7074 | -1.9119 |
| depressionYes | 0.4683 | 0.2170 | 2.1576 | 0.0310 | 0.0428 | 0.8937 |
| income$25,000 to < $50,000 | -0.7976 | 0.2765 | -2.8843 | 0.0039 | -1.3397 | -0.2555 |
| income$50,000 to < $100,000 | -0.9094 | 0.2689 | -3.3814 | 0.0007 | -1.4366 | -0.3822 |
| income$100,000 or more | -0.8343 | 0.3937 | -2.1191 | 0.0341 | -1.6061 | -0.0625 |
| educationHigh school graduate or GED | 0.9845 | 0.3955 | 2.4890 | 0.0128 | 0.2091 | 1.7598 |
| educationSome college or technical school | 1.5535 | 0.3986 | 3.8976 | 0.0001 | 0.7722 | 2.3349 |
| educationCollege graduate | 1.4388 | 0.3993 | 3.6029 | 0.0003 | 0.6560 | 2.2216 |
I decided to select the maximum model for this analysis because sex is an important variable when considering public health. This goes against the consensus decision from the stepwise selection methods. I think it’s important to build models based on previous public health research and models that attempt best understand the complexity of social and behavioral characteristics.
menthlth_days (continuous predictor) - Every additional day with poor physical health was associated with 0.16 more days of poor physical health, adjusting for other variables.
bmi (continuous predictor) - Every additional one unit increase in BMI was associated with 0.01 more days of poor physical health, adjusting for other variables.
gen_health: (categorical) - Those with fair to poor overall health had 10.32 more days of poor physical health compared to those with very good to excellent general health, adjusting for other variables.
par(mfrow = c(2, 2))
plot(max_model)
par(mfrow = c(1, 1))
Residuals vs. Fitted: Is there evidence of non-linearity or non-constant variance? This test checks linearity and equal variance. We want a horizontal red line and random scatter with no pattern. There is no fan shape to the residuals; however, there does appear to be a downward sloping pattern in the residuals instead of a random scatter so there is evidence for heteroscedasticity.
Normal Q-Q: Do the residuals approximately follow a normal distribution? Where do they deviate? This test checks normality of residuals. Since there appears to be an S-curve there is evidence to suggest non-normality.
Scale-Location: Is the spread of residuals roughly constant across fitted values? This test checks for equal variance (homoscedasticity). Since a flat line indicates constant variance we can conclude that ther is evidence of heteroscedasticity since the line bends up from the fitted values closer to 0.
Residuals vs. Leverage: Are there any highly influential observations (large Cook’s distance)? This test identifies influential observations using Cook’s distance. There are approximately 3 influential points in the upper right corner.
The line assumptions are reasonably satisfied. Even though there are violations seen in these four graphs, the large sample size of our dataset is protective.
brfss_analytic_logistic <- brfss_analytic %>%
mutate(physhlth_days_binary = factor(ifelse(physhlth_days >= 14, "Yes", "No")))
brfss_analytic_logistic$physhlth_days_binary <- relevel(brfss_analytic_logistic$physhlth_days_binary, ref = "No")
tibble(Metric = c("Observations", "Number with Physical Distress", "Prevalence of Those Without Physical Distress", "Number with Physical Distress", "Prevalence of Physical Distress"),
Value = c(nrow(brfss_analytic_logistic),
sum(brfss_analytic_logistic$physhlth_days_binary == "No"),
paste0(round(100 * mean(brfss_analytic_logistic$physhlth_days_binary == "No"), 1), "%"),
sum(brfss_analytic_logistic$physhlth_days_binary == "Yes"),
paste0(round(100 * mean(brfss_analytic_logistic$physhlth_days_binary == "Yes"), 1), "%"))) |>
kable(caption = "Analytic Dataset Overview") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value |
|---|---|
| Observations | 8000 |
| Number with Physical Distress | 6908 |
| Prevalence of Those Without Physical Distress | 86.4% |
| Number with Physical Distress | 1092 |
| Prevalence of Physical Distress | 13.7% |
This outcome is heavily skewed toward those without frequent physical distress. Those who experience frequent physical distress make up 13.1% of the sample.
mod_simple <- glm(physhlth_days_binary ~ exercise,
data = brfss_analytic_logistic, family = binomial)
tidy(mod_simple, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: Physical Distress ~ Exercise (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.393 | 0.052 | -17.858 | 0 | 0.355 | 0.435 |
| exerciseYes | 0.264 | 0.068 | -19.588 | 0 | 0.231 | 0.301 |
exp(coef(mod_simple)) # Odds ratio
## (Intercept) exerciseYes
## 0.3932151 0.2637865
exp(confint(mod_simple)) # 95% CI for OR
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.3546492 0.4353099
## exerciseYes 0.2308622 0.3014338
I selected exercise as a predictor of physical health. This has a logical connection as exercise is required for most adults to have good physical health. In addition, the linear model showed that exercise was associated with physical health, with those who exercise experiencing 1.9 fewer days of poor physical health.
The OR for those who exercise is 0.273. Those who exercise have 0.273 [0.239, 0.313] times the odds of experiencing frequent physical distress, as compared to those who do not exercise. This is a significant association based on the confidence interval no including 1.0.
mod_logistic <- glm(physhlth_days_binary ~ exercise + menthlth_days + gen_health + bmi + sex + depression + income + education,
data = brfss_analytic_logistic, family = binomial)
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Multiple Logistic Regression: Physical Distress ~ Exercise and Other Variables (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.080 | 0.238 | -10.583 | 0.000 | 0.050 | 0.128 |
| exerciseYes | 0.481 | 0.084 | -8.683 | 0.000 | 0.408 | 0.568 |
| menthlth_days | 1.044 | 0.004 | 10.627 | 0.000 | 1.036 | 1.052 |
| gen_healthGood | 2.209 | 0.116 | 6.848 | 0.000 | 1.763 | 2.777 |
| gen_healthFair/Poor | 18.112 | 0.112 | 25.860 | 0.000 | 14.583 | 22.628 |
| bmi | 0.983 | 0.005 | -3.121 | 0.002 | 0.973 | 0.994 |
| sexFemale | 1.051 | 0.080 | 0.617 | 0.537 | 0.898 | 1.229 |
| depressionYes | 1.214 | 0.094 | 2.053 | 0.040 | 1.008 | 1.459 |
| income$25,000 to < $50,000 | 0.768 | 0.113 | -2.349 | 0.019 | 0.616 | 0.957 |
| income$50,000 to < $100,000 | 0.706 | 0.113 | -3.089 | 0.002 | 0.566 | 0.881 |
| income$100,000 or more | 0.681 | 0.217 | -1.767 | 0.077 | 0.441 | 1.033 |
| educationHigh school graduate or GED | 1.192 | 0.164 | 1.073 | 0.283 | 0.867 | 1.649 |
| educationSome college or technical school | 1.539 | 0.166 | 2.591 | 0.010 | 1.114 | 2.139 |
| educationCollege graduate | 1.401 | 0.171 | 1.973 | 0.049 | 1.005 | 1.964 |
exp(coef(mod_logistic)) # Odds ratio
## (Intercept)
## 0.08019499
## exerciseYes
## 0.48107871
## menthlth_days
## 1.04407230
## gen_healthGood
## 2.20903134
## gen_healthFair/Poor
## 18.11182672
## bmi
## 0.98344257
## sexFemale
## 1.05066085
## depressionYes
## 1.21370477
## income$25,000 to < $50,000
## 0.76774789
## income$50,000 to < $100,000
## 0.70560707
## income$100,000 or more
## 0.68146363
## educationHigh school graduate or GED
## 1.19230037
## educationSome college or technical school
## 1.53932299
## educationCollege graduate
## 1.40062604
exp(confint(mod_logistic)) # 95% CI for OR
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.05014431 0.1277143
## exerciseYes 0.40791318 0.5676309
## menthlth_days 1.03579971 1.0524129
## gen_healthGood 1.76331638 2.7765790
## gen_healthFair/Poor 14.58328643 22.6278988
## bmi 0.97313001 0.9937654
## sexFemale 0.89804522 1.2293807
## depressionYes 1.00782270 1.4590308
## income$25,000 to < $50,000 0.61586125 0.9574224
## income$50,000 to < $100,000 0.56590084 0.8810241
## income$100,000 or more 0.44065329 1.0331851
## educationHigh school graduate or GED 0.86680614 1.6486962
## educationSome college or technical school 1.11376685 2.1394815
## educationCollege graduate 1.00492406 1.9635539
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.080 | 0.238 | -10.583 | 0.000 | 0.050 | 0.128 |
| exerciseYes | 0.481 | 0.084 | -8.683 | 0.000 | 0.408 | 0.568 |
| menthlth_days | 1.044 | 0.004 | 10.627 | 0.000 | 1.036 | 1.052 |
| gen_healthGood | 2.209 | 0.116 | 6.848 | 0.000 | 1.763 | 2.777 |
| gen_healthFair/Poor | 18.112 | 0.112 | 25.860 | 0.000 | 14.583 | 22.628 |
| bmi | 0.983 | 0.005 | -3.121 | 0.002 | 0.973 | 0.994 |
| sexFemale | 1.051 | 0.080 | 0.617 | 0.537 | 0.898 | 1.229 |
| depressionYes | 1.214 | 0.094 | 2.053 | 0.040 | 1.008 | 1.459 |
| income$25,000 to < $50,000 | 0.768 | 0.113 | -2.349 | 0.019 | 0.616 | 0.957 |
| income$50,000 to < $100,000 | 0.706 | 0.113 | -3.089 | 0.002 | 0.566 | 0.881 |
| income$100,000 or more | 0.681 | 0.217 | -1.767 | 0.077 | 0.441 | 1.033 |
| educationHigh school graduate or GED | 1.192 | 0.164 | 1.073 | 0.283 | 0.867 | 1.649 |
| educationSome college or technical school | 1.539 | 0.166 | 2.591 | 0.010 | 1.114 | 2.139 |
| educationCollege graduate | 1.401 | 0.171 | 1.973 | 0.049 | 1.005 | 1.964 |
Exercise (binary predictor) - The OR for those who exercise is 0.518. Those who exercise have 0.273 [0.439, 0.611] times the odds of experiencing frequent physical distress, as compared to those who do not exercise and holding all other variables constant. This is a significant association based on the confidence interval no including 1.0.
Mental health days (continuous predictor) - The OR for mental health days is 1.040. For every one-unit increase in days with poor mental health, the odds of frequent physical distress are multiplied by 1.040 [1.032, 1.049], holding all other variables constant. This is a significant association based on the confidence interval no including 1.0.
General health days (categorical predictor) - The OR for those who have fair to poor general health is 19.061. Those who have fair to poor general health have 19.061 [15.196, 24.079] times the odds of experiencing frequent physical distress, as compared to those who have very good to excellent general health and holding all other variables constant. This is a significant association based on the confidence interval no including 1.0.
mod_no_income <- glm(physhlth_days_binary ~ exercise + menthlth_days + gen_health + bmi + sex + depression + education,
data = brfss_analytic_logistic, family = binomial)
anova(mod_no_income, mod_logistic, test = "Chisq") |>
kable(digits = 3,
caption = "LR Test for Exercise × Sleep Interaction") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7989 | 4469.169 | NA | NA | NA |
| 7986 | 4459.261 | 3 | 9.909 | 0.019 |
The null hypothesis is that income has no impact on the model, whereas the alternative hypothesis states that income does improve the model’s ability to predict physical health distress.
The test statistic is 15.19 with 3 degrees of freedom and a significant p-value = 0.002. Therefore, we can reject the null hypothesis and conclude that income does improve the ability to predict physical health distress.
roc_obj <- roc(brfss_analytic_logistic$physhlth_days_binary, fitted(mod_logistic))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)
auc(roc_obj)
## Area under the curve: 0.8548
ci.auc(roc_obj)
## 95% CI: 0.8415-0.868 (DeLong)
The AUC value is 0.8611 [0.849, 0.874]. The AUC value suggests that the model has excellent discrimination between individuals with an without frequent physical distress.
hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.2707, df = 8, p-value = 0.6169
The null hypothesis is that the model fits the data well and alternative hypothesis is that the model does not fit the data well. The test statistic is 10.585 with 8 degrees of freedom and a p-value of 0.2263. We can not reject the null hypothesis so can conclude that there is no evidence of poor model fit. While discrimination assesses how well the model separates events from non-events, ROC/AUC assesses how well predicted probabilities match observed rates.
The results of this analysis suggests that many sociodemographic factors are associated and predictive of frequent physical distress among adults in the U.S. The predictors with the strongest association were income, exercise, general health, depression, and education. Both linear and logistic models showed similar results among all variables.
There were limitations in this analysis. Cross-sectional data does not allow researchers to determine the direction of association between outcome variables and covariates. There was no inclusion of common diseases known to impact physical health and quality of life. Such diseases include cardiovascular disease, diabetes, cancer, etc. Participants with these health conditions will be more likely to experience frequent physical distress regardless of other variables controlled for in this analysis.
The linear regression model assesses physical health distress as a continuous outcome variable so our results must be analyzed based on additional or fewer days of physical health distress; whereas, the logistic regression model assesses our outcome as a binary variables with our results only determining the predictive ability of our covariates to determine the presence of frequent physical health distress. Logistic modeling would be preferable when there is a clear delineation between the two groups, or if previous research or current institutions use cutoffs to better describe a condition. Linear regression uses R-squared to determine the fit of the model, whereas logistic regression uses AUC to determine the predictive ability of the model to classify between the binary groups.
I did not use AI for any sections of this assignment.