Part 0: Data Preparation
# Load Packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
## Warning: package 'haven' was built under R version 4.5.2
library(broom)
## Warning: package 'broom' was built under R version 4.5.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(leaps)
## Warning: package 'leaps' was built under R version 4.5.2
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.3
## ResourceSelection 0.3-6 2023-06-27
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.2
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Load BRFSS 2023 Data
brfss_full <- read_xpt(
"C:/Users/abbym/OneDrive/Desktop/STATS553/Assignments/Homework #4/LLCP2023.XPT"
) |>
clean_names()
brfss_clean <- brfss_full |>
# Recode variables
mutate(
# Outcome: mentally unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Physical health days (key predictor)
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# Age category
age_group = factor(case_when(
ageg5yr == 1 ~ "18-34",
ageg5yr == 2 ~ "18-34",
ageg5yr == 3 ~ "18-34",
ageg5yr == 4 ~ "35-49",
ageg5yr == 5 ~ "35-49",
ageg5yr == 6 ~ "35-49",
ageg5yr == 7 ~ "50-64",
ageg5yr == 8 ~ "50-64",
ageg5yr == 9 ~ "50-64",
ageg5yr == 10 ~ "65+",
ageg5yr == 11 ~ "65+",
ageg5yr == 12 ~ "65+",
ageg5yr == 13 ~ "65+",
ageg5yr == 14 ~ NA
)),
# Income
income = factor(case_when(
incomg1 == 1 ~ "Less than $25K",
incomg1 == 2 ~ "Less than $25K",
incomg1 == 3 ~ "$25K-$49K",
incomg1 == 4 ~ "$25K-$49K",
incomg1 == 5 ~ "$50K-$99K",
incomg1 == 6 ~ "$50K-$99K",
incomg1 == 7 ~ "$100K+",
incomg1 == 9 ~ NA
)),
#Relevel Income
income = relevel(income, ref= "Less than $25K"),
# Education
education = factor(case_when(
educa == 1 ~ "Less than high school",
educa == 2 ~ "Less than high school",
educa == 3 ~ "Less than high school",
educa == 4 ~ "HS/GED",
educa == 5 ~ "Some College",
educa == 6 ~ "College Graduate",
educa == 9 ~ NA
)),
# Relevel education
education = relevel(education, ref= "Less than high school"),
# Smoker
smoking = factor(case_when(
smoker3 == 1 ~ "Current",
smoker3 == 2 ~ "Current",
smoker3 == 3 ~ "Former",
smoker3 == 4 ~ "Never",
smoker3 == 9 ~ NA
)),
# Relevel smoking
smoking = relevel(smoking, ref= "Never"),
# Sex
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
# Exercise in past 30 days (any physical activity)
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# BMI (stored as integer × 100 in BRFSS)
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_)
) |>
# Select Variables
dplyr::select(physhlth_days, menthlth_days, bmi, sex, exercise, age_group, income, education, smoking)
brfss_clean|>
select(physhlth_days, menthlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
summarise(
across(
everything(),
list(
n_missing = ~ sum(is.na(.) | . %in% c(77, 88, 99, 7777, 9999)),
pct_missing = ~ sum(is.na(.) | . %in% c(77, 88, 99, 7777, 9999)) / n() * 100
),
.names = "{.col}_{.fn}"
)
) |>
pivot_longer(
everything(),
names_to = c("Variable", ".value"),
names_pattern = "(.*)_(n_missing|pct_missing)"
) |>
mutate(pct_missing = round(pct_missing, 1)) |>
arrange(desc(pct_missing)) |>
kable(
caption = "Missing or Don't Know/Refused (%) — BRFSS full sample",
col.names = c("Variable", "N Missing / DK / Refused", "% Missing / DK / Refused")
) |>
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)
| Variable | N Missing / DK / Refused | % Missing / DK / Refused |
|---|---|---|
| income | 86623 | 20.0 |
| bmi | 40535 | 9.4 |
| smoking | 23062 | 5.3 |
| physhlth_days | 10785 | 2.5 |
| menthlth_days | 8108 | 1.9 |
| age_group | 7779 | 1.8 |
| education | 2325 | 0.5 |
| exercise | 1251 | 0.3 |
| sex | 0 | 0.0 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
# Report analytic sample size
nrow(brfss_analytic)
## [1] 8000
brfss_analytic |>
tbl_summary(
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "BMI (kg/m^2)",
age_group ~ "Age Group (years)",
income ~ "Household income",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Education Level",
smoking ~ "Smoking status",
sex ~ "Sex"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
# add_overall() |>
bold_labels() |>
#italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**")
| Characteristic | N | N = 8,0001 |
|---|---|---|
| Physically unhealthy days (past 30) | 8,000 | 4.4 (8.7) |
| Mentally unhealthy days (past 30) | 8,000 | 4.3 (8.1) |
| BMI (kg/m^2) | 8,000 | 28.7 (6.5) |
| Sex | 8,000 | |
| Male | 3,936 (49%) | |
| Female | 4,064 (51%) | |
| Any physical activity (past 30 days) | 8,000 | 6,240 (78%) |
| Age Group (years) | 8,000 | |
| 18-34 | 1,277 (16%) | |
| 35-49 | 1,665 (21%) | |
| 50-64 | 2,055 (26%) | |
| 65+ | 3,003 (38%) | |
| Household income | 8,000 | |
| Less than $25K | 1,048 (13%) | |
| $100K+ | 638 (8.0%) | |
| $25K-$49K | 1,938 (24%) | |
| $50K-$99K | 4,376 (55%) | |
| Education Level | 8,000 | |
| Less than high school | 376 (4.7%) | |
| College Graduate | 3,659 (46%) | |
| HS/GED | 1,827 (23%) | |
| Some College | 2,138 (27%) | |
| Smoking status | 8,000 | |
| Never | 4,812 (60%) | |
| Current | 926 (12%) | |
| Former | 2,262 (28%) | |
| 1 Mean (SD); n (%) | ||
saveRDS(brfss_analytic,
"C:/Users/abbym/OneDrive/Desktop/STATS553/Assignments/Homework #4/brfss_analytic.rds")
Part 1: Model Selection for Linear Regression
# Fit maximum model
mod_max <- lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)
# Display summary
summary(mod_max)
##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.6671 -3.6770 -1.9783 0.6429 31.1695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.78494 0.65888 8.780 < 2e-16 ***
## menthlth_days 0.30399 0.01143 26.586 < 2e-16 ***
## bmi 0.04807 0.01399 3.436 0.000593 ***
## sexFemale 0.03086 0.17943 0.172 0.863456
## exerciseYes -3.44946 0.22542 -15.302 < 2e-16 ***
## age_group35-49 1.34457 0.29864 4.502 6.82e-06 ***
## age_group50-64 2.64207 0.28757 9.188 < 2e-16 ***
## age_group65+ 2.87902 0.27487 10.474 < 2e-16 ***
## income$100K+ -3.81913 0.43275 -8.825 < 2e-16 ***
## income$25K-$49K -2.62287 0.30737 -8.533 < 2e-16 ***
## income$50K-$99K -3.23452 0.29614 -10.922 < 2e-16 ***
## educationCollege Graduate -1.23624 0.45384 -2.724 0.006464 **
## educationHS/GED -1.19956 0.45099 -2.660 0.007834 **
## educationSome College -0.72133 0.45299 -1.592 0.111339
## smokingCurrent 1.15379 0.29713 3.883 0.000104 ***
## smokingFormer 0.67961 0.20590 3.301 0.000968 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.859 on 7984 degrees of freedom
## Multiple R-squared: 0.1844, Adjusted R-squared: 0.1829
## F-statistic: 120.4 on 15 and 7984 DF, p-value: < 2.2e-16
# Display AIC and BIC
AIC(mod_max)
## [1] 55707.42
BIC(mod_max)
## [1] 55826.2
Interpretation: The r-squared is 0.1849, meaning that the predictors in the model can explain 18.49% of the variance in physically unhealthy days. The adjusted r-squared is 0.1833, meaning that the predictors in the model can explain 18.33% of the variance in physically unhealthy days, while being penalized for the amount of predictors in the model. The AIC is 55704.55 and the BIC is 55830.32. The AIC and BIC are helpful for comparing simpler models to the maximum model.
# Prepare a model matrix (need numeric predictors for leaps)
# Use the formula interface approach
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + income + education + smoking,
data = brfss_analytic,
nvmax = 17, # maximum number of variables to consider
method = "exhaustive"
)
best_summary <- summary(best_subsets)
subset_metrics <- tibble(
p = 1:length(best_summary$adjr2),
`Adj. R²` = best_summary$adjr2,
BIC = best_summary$bic,
Cp = best_summary$cp
)
p1 <- ggplot(subset_metrics, aes(x = p, y = `Adj. R²`)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.max(best_summary$adjr2),
linetype = "dashed", color = "tomato") +
labs(title = "Adjusted R² by Model Size", x = "Number of Variables", y = "Adjusted R²") +
theme_minimal(base_size = 12)
p2 <- ggplot(subset_metrics, aes(x = p, y = BIC)) +
geom_line(linewidth = 1, color = "steelblue") +
geom_point(size = 3, color = "steelblue") +
geom_vline(xintercept = which.min(best_summary$bic),
linetype = "dashed", color = "tomato") +
labs(title = "BIC by Model Size", x = "Number of Variables", y = "BIC") +
theme_minimal(base_size = 12)
gridExtra::grid.arrange(p1, p2, ncol = 2)
Interpretation: A model size of 14 predictors maximizes
the adjusted r-squared, and a model size of 11 predictors minimizes the
BIC. The two criteria do not agree on a best model size. The BIC favors
a simpler model.
# Automated backward elimination using AIC
mod_backward <- step(mod_max, direction = "backward", trace = 1)
## Start: AIC=33002.4
## physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
##
## Df Sum of Sq RSS AIC
## - sex 1 2 493117 33000
## <none> 493115 33002
## - education 3 747 493862 33009
## - bmi 1 729 493844 33012
## - smoking 2 1293 494408 33019
## - income 3 7823 500938 33122
## - age_group 3 8240 501355 33129
## - exercise 1 14462 507577 33232
## - menthlth_days 1 43656 536771 33679
##
## Step: AIC=33000.43
## physhlth_days ~ menthlth_days + bmi + exercise + age_group +
## income + education + smoking
##
## Df Sum of Sq RSS AIC
## <none> 493117 33000
## - education 3 745 493862 33007
## - bmi 1 728 493845 33010
## - smoking 2 1295 494412 33017
## - income 3 7909 501026 33122
## - age_group 3 8302 501418 33128
## - exercise 1 14480 507597 33230
## - menthlth_days 1 44141 537257 33684
# Table
tidy(mod_backward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| educationCollege Graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome College | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
# Automated forward selection using AIC
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward", trace = 1)
## Start: AIC=34603.45
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 59071 545563 33783
## + exercise 1 37706 566927 34090
## + income 3 32443 572191 34168
## + smoking 2 12018 592616 34447
## + education 3 11873 592761 34451
## + age_group 3 5515 599119 34536
## + bmi 1 4732 599901 34543
## + sex 1 1559 603075 34585
## <none> 604634 34603
##
## Step: AIC=33783.01
## physhlth_days ~ menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 28346.8 517216 33358
## + income 3 19686.5 525876 33495
## + age_group 3 14455.2 531108 33574
## + education 3 8240.9 537322 33667
## + smoking 2 6070.4 539492 33697
## + bmi 1 2821.0 542742 33744
## + sex 1 266.4 545296 33781
## <none> 545563 33783
##
## Step: AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + income 3 11764.1 505452 33180
## + age_group 3 9921.3 507295 33209
## + smoking 2 3952.9 513263 33301
## + education 3 3409.0 513807 33311
## + bmi 1 753.5 516463 33348
## <none> 517216 33358
## + sex 1 106.7 517109 33359
##
## Step: AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
##
## Df Sum of Sq RSS AIC
## + age_group 3 9400.2 496052 33036
## + smoking 2 2483.5 502968 33145
## + bmi 1 781.6 504670 33170
## + education 3 937.1 504515 33171
## <none> 505452 33180
## + sex 1 6.7 505445 33182
##
## Step: AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
##
## Df Sum of Sq RSS AIC
## + smoking 2 1427.13 494625 33017
## + education 3 1013.78 495038 33026
## + bmi 1 666.76 495385 33027
## <none> 496052 33036
## + sex 1 14.93 496037 33038
##
## Step: AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking
##
## Df Sum of Sq RSS AIC
## + bmi 1 762.78 493862 33007
## + education 3 779.94 493845 33010
## <none> 494625 33017
## + sex 1 0.00 494625 33019
##
## Step: AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi
##
## Df Sum of Sq RSS AIC
## + education 3 745.36 493117 33000
## <none> 493862 33007
## + sex 1 0.35 493862 33009
##
## Step: AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi + education
##
## Df Sum of Sq RSS AIC
## <none> 493117 33000
## + sex 1 1.8268 493115 33002
# Table
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| educationCollege Graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome College | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both", trace = 1)
## Start: AIC=34603.45
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 59071 545563 33783
## + exercise 1 37706 566927 34090
## + income 3 32443 572191 34168
## + smoking 2 12018 592616 34447
## + education 3 11873 592761 34451
## + age_group 3 5515 599119 34536
## + bmi 1 4732 599901 34543
## + sex 1 1559 603075 34585
## <none> 604634 34603
##
## Step: AIC=33783.01
## physhlth_days ~ menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 28347 517216 33358
## + income 3 19686 525876 33495
## + age_group 3 14455 531108 33574
## + education 3 8241 537322 33667
## + smoking 2 6070 539492 33697
## + bmi 1 2821 542742 33744
## + sex 1 266 545296 33781
## <none> 545563 33783
## - menthlth_days 1 59071 604634 34603
##
## Step: AIC=33358.15
## physhlth_days ~ menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + income 3 11764 505452 33180
## + age_group 3 9921 507295 33209
## + smoking 2 3953 513263 33301
## + education 3 3409 513807 33311
## + bmi 1 754 516463 33348
## <none> 517216 33358
## + sex 1 107 517109 33359
## - exercise 1 28347 545563 33783
## - menthlth_days 1 49711 566927 34090
##
## Step: AIC=33180.09
## physhlth_days ~ menthlth_days + exercise + income
##
## Df Sum of Sq RSS AIC
## + age_group 3 9400 496052 33036
## + smoking 2 2484 502968 33145
## + bmi 1 782 504670 33170
## + education 3 937 504515 33171
## <none> 505452 33180
## + sex 1 7 505445 33182
## - income 3 11764 517216 33358
## - exercise 1 20424 525876 33495
## - menthlth_days 1 41532 546984 33810
##
## Step: AIC=33035.91
## physhlth_days ~ menthlth_days + exercise + income + age_group
##
## Df Sum of Sq RSS AIC
## + smoking 2 1427 494625 33017
## + education 3 1014 495038 33026
## + bmi 1 667 495385 33027
## <none> 496052 33036
## + sex 1 15 496037 33038
## - age_group 3 9400 505452 33180
## - income 3 11243 507295 33209
## - exercise 1 17199 513251 33307
## - menthlth_days 1 47177 543229 33761
##
## Step: AIC=33016.86
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking
##
## Df Sum of Sq RSS AIC
## + bmi 1 763 493862 33007
## + education 3 780 493845 33010
## <none> 494625 33017
## + sex 1 0 494625 33019
## - smoking 2 1427 496052 33036
## - age_group 3 8344 502968 33145
## - income 3 9897 504522 33169
## - exercise 1 16705 511329 33281
## - menthlth_days 1 44789 539414 33708
##
## Step: AIC=33006.52
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi
##
## Df Sum of Sq RSS AIC
## + education 3 745 493117 33000
## <none> 493862 33007
## + sex 1 0 493862 33009
## - bmi 1 763 494625 33017
## - smoking 2 1523 495385 33027
## - age_group 3 8245 502107 33133
## - income 3 9781 503643 33157
## - exercise 1 15215 509077 33247
## - menthlth_days 1 44206 538068 33690
##
## Step: AIC=33000.43
## physhlth_days ~ menthlth_days + exercise + income + age_group +
## smoking + bmi + education
##
## Df Sum of Sq RSS AIC
## <none> 493117 33000
## + sex 1 2 493115 33002
## - education 3 745 493862 33007
## - bmi 1 728 493845 33010
## - smoking 2 1295 494412 33017
## - income 3 7909 501026 33122
## - age_group 3 8302 501418 33128
## - exercise 1 14480 507597 33230
## - menthlth_days 1 44141 537257 33684
# Table
tidy(mod_stepwise, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| educationCollege Graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome College | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
Interpretation: The three methods all agreed on the same model. All of the methods got rid of the sex predictor. Mentally unhealthy days, exercise, income, age group, smoking status, BMI, and education level were all preserved by all three methods of model selection.
tidy(mod_stepwise, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.8000 | 0.6530 | 8.8821 | 0.0000 | 4.5200 | 7.0800 |
| menthlth_days | 0.3042 | 0.0114 | 26.7352 | 0.0000 | 0.2819 | 0.3265 |
| exerciseYes | -3.4505 | 0.2253 | -15.3127 | 0.0000 | -3.8922 | -3.0088 |
| income$100K+ | -3.8271 | 0.4303 | -8.8949 | 0.0000 | -4.6705 | -2.9837 |
| income$25K-$49K | -2.6241 | 0.3073 | -8.5398 | 0.0000 | -3.2264 | -2.0217 |
| income$50K-$99K | -3.2385 | 0.2952 | -10.9701 | 0.0000 | -3.8172 | -2.6598 |
| age_group35-49 | 1.3477 | 0.2981 | 4.5215 | 0.0000 | 0.7634 | 1.9320 |
| age_group50-64 | 2.6452 | 0.2870 | 9.2171 | 0.0000 | 2.0826 | 3.2077 |
| age_group65+ | 2.8828 | 0.2740 | 10.5221 | 0.0000 | 2.3457 | 3.4198 |
| smokingCurrent | 1.1503 | 0.2964 | 3.8807 | 0.0001 | 0.5692 | 1.7313 |
| smokingFormer | 0.6768 | 0.2052 | 3.2977 | 0.0010 | 0.2745 | 1.0791 |
| bmi | 0.0480 | 0.0140 | 3.4339 | 0.0006 | 0.0206 | 0.0754 |
| educationCollege Graduate | -1.2319 | 0.4531 | -2.7188 | 0.0066 | -2.1201 | -0.3437 |
| educationHS/GED | -1.1978 | 0.4508 | -2.6567 | 0.0079 | -2.0815 | -0.3140 |
| educationSome College | -0.7178 | 0.4525 | -1.5863 | 0.1127 | -1.6048 | 0.1692 |
Interpretation: Since all three methods of model selection chose the same model, I am choosing the model all three created for my final model. For every one unit increase in mentally unhealthy days, physically unhealthy days will increase by 0.3042 days, holding all other variables constant. For every one unit increase in BMI, physically unhealthy days will increase by 0.0480 days, holding all other variables constant. Compared those who don’t exercise, those who do exercise have 3.4505 fewer physically unhealthy days, holding all other variables constant.
par(mfrow = c(2, 2))
plot(mod_stepwise)
par(mfrow = c(1, 1))
Interpretation: Residuals vs. Fitted: The red line is not exactly horizontal, meaning the data aren’t perfectly linear. In addition, there may be some evidence of heteroscedasticity, as the data don’t seem to be exactly randomly scattered. Q-Q Residuals: There are deviations from the normality line, meaning the data aren’t exactly normally distributed. The largest deviation is on the right side of the tail. Scale-Location: There is large evidence of heteroscedasticity here, as the red line is not horizontal. The variances are not equal. Residuals vs Leverage: There are many observations beyond Cook’s distance, meaning there are potentially highly influential observations within the dataset.
The LINE assumptions are violated in many ways. However, the sample size is so large these deviations may not cause issues throughout the analysis. There is evidence that the data is not linear, the variances aren’t equally, it isn’t normally distributed, and that independence is violated.
Part 2: Logistic Regression
brfss_log <- brfss_analytic |>
mutate(fpd = factor(
ifelse(physhlth_days >= 14, 1, 0),
levels = c(0, 1),
labels = c("No", "Yes")
))
brfss_log %>%
select(fpd) %>%
tbl_summary(
type = list(fpd ~ "categorical"),
label = list(
fpd ~ "Frequent Physical Distress"
),
statistic = list(
all_categorical() ~ "{n} ({p}%)"
)) %>%
add_n() %>%
bold_labels() %>%
modify_caption("**Frequent Physical Distress Descriptive Statistics**")
| Characteristic | N | N = 8,0001 |
|---|---|---|
| Frequent Physical Distress | 8,000 | |
| No | 6,914 (86%) | |
| Yes | 1,086 (14%) | |
| 1 n (%) | ||
Interpretation: There are many more people who don’t experience frequent physical distress than people who do. 86% of people in the dataset do not experience it, and 14% of people do.
mod_simple <- glm(fpd ~ bmi,
data = brfss_log, family = binomial)
# Model Summary
summary(mod_simple)
##
## Call:
## glm(formula = fpd ~ bmi, family = binomial, data = brfss_log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.771987 0.141013 -19.658 < 2e-16 ***
## bmi 0.031542 0.004619 6.829 8.57e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6354.8 on 7999 degrees of freedom
## Residual deviance: 6310.3 on 7998 degrees of freedom
## AIC: 6314.3
##
## Number of Fisher Scoring iterations: 4
exp(coef(mod_simple)) # Odds ratio
## (Intercept) bmi
## 0.06253761 1.03204520
exp(confint(mod_simple)) # 95% CI for OR
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.04743404 0.08246141
## bmi 1.02269073 1.04138719
Interpretation: For every one-unit increase in BMI, the odds of frequent physical distress are multiplied by 1.032, 95% CI [1.023, 1.041]. The association is statistically significant, as the p-value from the Wald test is 0, which is less than 0.05, meaning you can reject the null hypothesis that there is no association between BMI and frequent physical distress.
mod_logistic <- glm(fpd ~ menthlth_days + exercise + income + age_group + smoking + bmi + education,
data = brfss_log, family = binomial)
summary(mod_logistic)
##
## Call:
## glm(formula = fpd ~ menthlth_days + exercise + income + age_group +
## smoking + bmi + education, family = binomial, data = brfss_log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.249364 0.241714 -9.306 < 2e-16 ***
## menthlth_days 0.068264 0.003648 18.712 < 2e-16 ***
## exerciseYes -0.885478 0.076964 -11.505 < 2e-16 ***
## income$100K+ -1.502512 0.229493 -6.547 5.87e-11 ***
## income$25K-$49K -0.590324 0.101531 -5.814 6.09e-09 ***
## income$50K-$99K -0.855241 0.100385 -8.520 < 2e-16 ***
## age_group35-49 0.763898 0.147572 5.176 2.26e-07 ***
## age_group50-64 1.214414 0.139734 8.691 < 2e-16 ***
## age_group65+ 1.293078 0.136943 9.442 < 2e-16 ***
## smokingCurrent 0.357308 0.105325 3.392 0.000693 ***
## smokingFormer 0.294754 0.081332 3.624 0.000290 ***
## bmi 0.014360 0.005155 2.786 0.005344 **
## educationCollege Graduate -0.493111 0.154804 -3.185 0.001446 **
## educationHS/GED -0.335329 0.148966 -2.251 0.024383 *
## educationSome College -0.146800 0.149691 -0.981 0.326745
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6354.8 on 7999 degrees of freedom
## Residual deviance: 5280.6 on 7985 degrees of freedom
## AIC: 5310.6
##
## Number of Fisher Scoring iterations: 6
# Table
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Logistic Regression: FMD ~ menthlth_days + exercise + income + age_group + smoking + bmi + education (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.105 | 0.242 | -9.306 | 0.000 | 0.065 | 0.169 |
| menthlth_days | 1.071 | 0.004 | 18.712 | 0.000 | 1.063 | 1.078 |
| exerciseYes | 0.413 | 0.077 | -11.505 | 0.000 | 0.355 | 0.480 |
| income$100K+ | 0.223 | 0.229 | -6.547 | 0.000 | 0.139 | 0.343 |
| income$25K-$49K | 0.554 | 0.102 | -5.814 | 0.000 | 0.454 | 0.676 |
| income$50K-$99K | 0.425 | 0.100 | -8.520 | 0.000 | 0.349 | 0.518 |
| age_group35-49 | 2.147 | 0.148 | 5.176 | 0.000 | 1.613 | 2.878 |
| age_group50-64 | 3.368 | 0.140 | 8.691 | 0.000 | 2.573 | 4.452 |
| age_group65+ | 3.644 | 0.137 | 9.442 | 0.000 | 2.800 | 4.792 |
| smokingCurrent | 1.429 | 0.105 | 3.392 | 0.001 | 1.161 | 1.755 |
| smokingFormer | 1.343 | 0.081 | 3.624 | 0.000 | 1.144 | 1.574 |
| bmi | 1.014 | 0.005 | 2.786 | 0.005 | 1.004 | 1.025 |
| educationCollege Graduate | 0.611 | 0.155 | -3.185 | 0.001 | 0.452 | 0.830 |
| educationHS/GED | 0.715 | 0.149 | -2.251 | 0.024 | 0.535 | 0.961 |
| educationSome College | 0.863 | 0.150 | -0.981 | 0.327 | 0.646 | 1.162 |
Interpretation: For everyone 1 unit increase in mentally unhealthy days, the odds of frequent physical distress are multiplied by 1.071, holding all other variables constant. For every 1 unit increase in bmi, the odds of frequent physical distress are multiplied by 1.014, holding all other variables constant. Compared to those who don’t exercise, those who do have 58.7% decreased odds of having frequent physical distress, holding all other variables constant.
mod_reduced <- glm(fpd ~ menthlth_days + exercise + age_group + smoking + bmi + education,
data = brfss_log, family = binomial)
# Liklihood Ratio Test
anova(mod_reduced, mod_logistic, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: fpd ~ menthlth_days + exercise + age_group + smoking + bmi +
## education
## Model 2: fpd ~ menthlth_days + exercise + income + age_group + smoking +
## bmi + education
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7988 5367.9
## 2 7985 5280.6 3 87.353 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation: The null hypothesis of the LRT is that income has no effect on the model predicting frequent physical distress, and the alternative hypothesis is that income does have an effect on the model predicting frequent physical distress. The test statistic is 87.353, the degrees of freedom are 3, and the p-value is < 2.2e-16. Since the p-value is less than 0.05, you reject the null hypothesis, meaning income does help to predict frequent mental distress.
roc_obj <- roc(brfss_log$fpd, fitted(mod_logistic))
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)
auc(roc_obj)
## Area under the curve: 0.7773
ci.auc(roc_obj)
## 95% CI: 0.7618-0.7927 (DeLong)
Interpretation: The AUC of the model is 0.777, meaning the model has acceptable discrimination between those with frequent physical distress and those without frequent physical distress.
hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 10.713, df = 8, p-value = 0.2185
Interpretation: The null hypothesis is that the model fits the data well and the alternative hypothesis is that the model does not fit the data well. The t-statistic is 10.713, the degrees of freedom are 8, and the p-value is 0.2185. Since the p-value is greater than 0.05, you fail to reject the null hypothesis that the model doesn’t fit the data well. The model is not a poor fit for the data. The Hosmer-Lemeshow test complements the ROC/AUC findings well, because it shows that the model fits the data well (there’s agreement between predicted and observed events), but that it can also discriminate between “cases” and “non-cases”. It is good to test both because models can have good discrimination and poor fit, or vice versa.
Part 3: Reflection There are many factors that play a role in the physical health burden among US adults. These factors include mentally unhealthy days, sex, exercise status, age group, income, education level, BMI, smoking status, and I’m sure others, as well. It would be difficult to perfectly predict whether a person has frequent physical distress, as there are so many factors that play a role in determining it. Age group and income had the strongest association in both the linear and logistic models. All of the predictors that were significant in the linear model were also significant in the logistic model. EducationSomeCollege was insignificant in both the linear and logistic model. The key limitations of using cross-sectional survey data is not being able to establish temporality. Because the study is cross-sectional, all of the data is collected at the same time, so you can’t establish whether one factor caused another factor because you don’t know which one came first. Two potential confounders that were not measured in the model could be whether a person has a condition like Multiple Sclerosis, which could impact their physical ability, and a variable such as the general health of a person. The linear regression gives us a tangible number of physically unhealthy days that each person of a certain characteristic is predicted to have, and logistic regression gives you the odds of a person having frequent physical distress based on a certain characteristic they possess. Linear regression gives us a predicted number of units that a person will possess for a certain characteristic and logistic regression gives us a predicted odds of developing/having a certain condition.You would prefer linear regression when you have a continuous outcome, and logistic regression when you have a binary outcome. Linear regression is better when you want a specifically predicted number a person will have, and logistic regression is better than you want more of a predicted probability/odds. R-squared is the percentage of variability in the outcome that can be explained by the model. AUC determines how well a model can discriminate between a “case” and “non-case”. Both are based on the outcome variable and how well the model can help to predict the outcome, but since the outcome variables for models are on different scales, the criteria and measures of model selection are different and measure different things. I did not use any AI for this assignment.