knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 10,
fig.height = 6
)library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(leaps)
library(gtsummary)
library(pROC)
library(ResourceSelection)
# Load additional packages
library(janitor)
library(dplyr)
library(knitr)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))brfss_data <- read_xpt(
"/Users/morganwheat/Desktop/R Project/LLCP2023.XPT 2"
) |>
clean_names()
# Report the number of columns and rows in the dataset
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_data), ncol(brfss_data))) |>
kable(caption = "Raw Dataset Dimensions") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 433323 |
| Variables | 350 |
# Select key outcome = `physhlth`
# Selected continuous variables = `menthlth` and `bmi5`
# Selected binary variables = `sexvar` and `exerany2`
# Selected categorical variables = `ageg5yr`, `educa`, `genhlth`, and `marital`
# Select the nine variables
brfss_data <- brfss_data |>
dplyr::select(physhlth, menthlth, bmi5, sexvar, exerany2, ageg5yr, educa, genhlth, marital)
# Display the results
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_data), ncol(brfss_data))) |>
kable(caption = "Selected Dataset Dimensions") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 433323 |
| Variables | 9 |
I had selected the variable Physically Unhealthy Days (Past 30 Days) since this was the selected outcome of the assignment. Additionally, in order to follow the guidelines of this assignment, Mentally Unhealthy Days (Past 30 Days) and BMI were the required continuous variables. Finally, I had selected sex and exercise as my binary variables and education, general health status, and marital status as my categorical variables purely based on my own interests to see how they influenced the outcome.
brfss_recode_new <- brfss_data |>
mutate(
# Physically unhealthy days (past 30)
# Outcome
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# Binary outcome: frequent physical distress (>= 14 days)
fpd = factor(
case_when(
physhlth_days >= 14 ~ 1,
physhlth_days < 14 ~ 0,
TRUE ~ NA_real_
),
levels = c(0, 1),
labels = c("No", "Yes")
),
# Mentally unhealthy days (past 30)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77, 99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Body Mass Index x 100 (divide by 100)
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ bmi5 / 100
),
# Sex (1 = Male, 2 = Female)
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
# Any Exercise in the past 30 days (1 = Yes, 2 = No)
exercise = factor(
case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
),
levels = c("No", "Yes")
),
# Age Group
age_group = factor(
case_when(
ageg5yr %in% c(1, 2, 3) ~ "18-34",
ageg5yr %in% c(4, 5, 6) ~ "35-49",
ageg5yr %in% c(7, 8, 9) ~ "50-64",
ageg5yr %in% c(10, 11, 12, 13) ~ "65+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-34", "35-49", "50-64", "65+")
),
# Highest Level of Education Completed
education = factor(
case_when(
educa %in% c(1, 2, 3) ~ "Less than high school",
educa == 4 ~ "HS/GED",
educa == 5 ~ "Some college",
educa == 6 ~ "College graduate",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school", "HS/GED", "Some college", "College graduate")
),
# General Health Status
gen_health = factor(
case_when(
genhlth %in% c(1, 2) ~ "Excellent/VG",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor",
TRUE ~ NA_character_
),
levels = c("Excellent/VG", "Good", "Fair/Poor")
),
# Marital Status
marital = factor(
case_when(
marital %in% c(1, 6) ~ "Married/Partnered",
marital %in% c(2, 3, 4) ~ "Previously married",
marital == 5 ~ "Never married",
marital == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Married/Partnered", "Previously married", "Never married")
)
)
# Report the number and percentage of missing observations for physhlth_days and at least two other key variables, before removing any missing values.
vars <- c("physhlth_days", "menthlth_days", "bmi", "sex", "exercise", "age_group", "education", "gen_health", "marital")
missing_summary <- data.frame(
Variable = vars,
missing_count = colSums(is.na(brfss_recode_new[vars])),
missing_percent = round(
colSums(is.na(brfss_recode_new[vars])) / nrow(brfss_recode_new) * 100, 2
)
)
missing_summary_sorted <- missing_summary[order(missing_summary$missing_count), ]
print(missing_summary_sorted)## Variable missing_count missing_percent
## sex sex 0 0.00
## exercise exercise 1251 0.29
## gen_health gen_health 1262 0.29
## education education 2325 0.54
## marital marital 4289 0.99
## age_group age_group 7779 1.80
## menthlth_days menthlth_days 8108 1.87
## physhlth_days physhlth_days 10785 2.49
## bmi bmi 40535 9.35
# Remove all rows with any missing values uding drop_na()
# Draw a random sample of 8,000 observations
set.seed(1220)
brfss_analytics <- brfss_recode_new |>
drop_na() |>
slice_sample(n = 8000)
# Report the final analytic sample size
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_analytics), ncol(brfss_analytics))) |>
kable(caption = "Analytic Dataset Dimensions") |>
kable_styling()| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 18 |
gtsummary::tbl_summary(
brfss_analytics,
label = list(
physhlth_days ~ "Physically Unhealthy Days (Past 30 Days)",
menthlth_days ~ "Mentally Unhealthy Days (Past 30 Days)",
bmi ~ "Body Mass Index (BMI)",
sex ~ "Sex",
exercise ~ "Exercise in Past 30 Days",
age_group ~ "Age Group",
education ~ "Highest Level of Education",
gen_health ~ "General Health Status",
marital ~ "Marital Status"
)
)| Characteristic | N = 8,0001 |
|---|---|
| NUMBER OF DAYS PHYSICAL HEALTH NOT GOOD | 88 (10, 88) |
| NUMBER OF DAYS MENTAL HEALTH NOT GOOD | 88 (10, 88) |
| COMPUTED BODY MASS INDEX | 2,741 (2,413, 3,158) |
| SEX OF RESPONDENT | |
| 1 | 3,899 (49%) |
| 2 | 4,101 (51%) |
| EXERCISE IN PAST 30 DAYS | |
| 1 | 6,120 (77%) |
| 2 | 1,880 (24%) |
| REPORTED AGE IN FIVE-YEAR AGE CATEGORIES | 8.0 (5.0, 11.0) |
| EDUCATION LEVEL | |
| 1 | 7 (<0.1%) |
| 2 | 146 (1.8%) |
| 3 | 282 (3.5%) |
| 4 | 1,988 (25%) |
| 5 | 2,097 (26%) |
| 6 | 3,480 (44%) |
| GENERAL HEALTH | |
| 1 | 1,222 (15%) |
| 2 | 2,732 (34%) |
| 3 | 2,576 (32%) |
| 4 | 1,126 (14%) |
| 5 | 344 (4.3%) |
| Marital Status | |
| Married/Partnered | 4,533 (57%) |
| Previously married | 1,995 (25%) |
| Never married | 1,472 (18%) |
| Physically Unhealthy Days (Past 30 Days) | 0 (0, 4) |
| fpd | 1,072 (13%) |
| Mentally Unhealthy Days (Past 30 Days) | 0 (0, 5) |
| Body Mass Index (BMI) | 27 (24, 32) |
| Sex | |
| Male | 3,899 (49%) |
| Female | 4,101 (51%) |
| Exercise in Past 30 Days | 6,120 (77%) |
| Age Group | |
| 18-34 | 1,355 (17%) |
| 35-49 | 1,522 (19%) |
| 50-64 | 2,108 (26%) |
| 65+ | 3,015 (38%) |
| Highest Level of Education | |
| Less than high school | 435 (5.4%) |
| HS/GED | 1,988 (25%) |
| Some college | 2,097 (26%) |
| College graduate | 3,480 (44%) |
| General Health Status | |
| Excellent/VG | 3,954 (49%) |
| Good | 2,576 (32%) |
| Fair/Poor | 1,470 (18%) |
| 1 Median (Q1, Q3); n (%) | |
# Fit the maximum logistics model that contains all of my selected predictors
max_mod <- lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + education + gen_health + marital,
data = brfss_analytics)
# Display the model Summary
tidy(max_mod, conf.int = TRUE) |>
kable(digits = 3, caption = "Working Model Coefficients") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 1.335 | 0.589 | 2.268 | 0.023 | 0.181 | 2.489 |
| menthlth_days | 0.205 | 0.011 | 19.274 | 0.000 | 0.184 | 0.226 |
| bmi | 0.022 | 0.013 | 1.712 | 0.087 | -0.003 | 0.046 |
| sexFemale | 0.070 | 0.165 | 0.424 | 0.672 | -0.253 | 0.393 |
| exerciseYes | -1.759 | 0.205 | -8.561 | 0.000 | -2.162 | -1.356 |
| age_group35-49 | 0.178 | 0.288 | 0.618 | 0.537 | -0.387 | 0.743 |
| age_group50-64 | 0.919 | 0.281 | 3.274 | 0.001 | 0.369 | 1.470 |
| age_group65+ | 1.505 | 0.278 | 5.419 | 0.000 | 0.960 | 2.049 |
| educationHS/GED | -0.317 | 0.386 | -0.822 | 0.411 | -1.073 | 0.439 |
| educationSome college | -0.189 | 0.386 | -0.488 | 0.626 | -0.946 | 0.569 |
| educationCollege graduate | -0.190 | 0.379 | -0.501 | 0.616 | -0.934 | 0.553 |
| gen_healthGood | 1.285 | 0.190 | 6.758 | 0.000 | 0.912 | 1.658 |
| gen_healthFair/Poor | 10.349 | 0.251 | 41.219 | 0.000 | 9.857 | 10.842 |
| maritalPreviously married | 0.108 | 0.204 | 0.531 | 0.595 | -0.292 | 0.508 |
| maritalNever married | -0.244 | 0.242 | -1.006 | 0.314 | -0.719 | 0.231 |
# Report and interpret: R-squared, Adjusted R-squared, AIC (Using AIC(), and BIC (using BIC())
glance(max_mod) |>
dplyr::select(r.squared, adj.r.squared, AIC, BIC) |>
kable(digits = 3, caption = "Model Fit Summary") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| r.squared | adj.r.squared | AIC | BIC |
|---|---|---|---|
| 0.323 | 0.322 | 54378.57 | 54490.36 |
The working model explains 32.3% of the variance in physically unhealthy days (r-squared = 0.323), (adjusted r-squared = 0.322).
# Use leaps::regsubsets() to perform best subsets regression on your maximum model
best_subsets <- leaps::regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + education + gen_health + marital,
data = brfss_analytics,
nvmax = 15,
method = "exhaustive"
)
best_summary <- summary(best_subsets)
# Create a summary table or plot showing Adjusted R-squared and BIC across model sizes (number of predictors)
models <- list(
"Mental health" = lm(physhlth_days ~ menthlth_days, data = brfss_analytics),
"+ bmi" = lm(physhlth_days ~ menthlth_days + bmi, data = brfss_analytics),
"+ sex" = lm(physhlth_days ~ menthlth_days + bmi + sex, data = brfss_analytics),
"+ exercise" = lm(physhlth_days ~ menthlth_days + bmi + sex + exercise, data = brfss_analytics),
"+ age_group" = lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group, data = brfss_analytics),
"+ education" = lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + education, data = brfss_analytics),
"+ general health" = lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + education + gen_health, data = brfss_analytics),
"+ marital (full)" = lm(physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group + education + gen_health + marital, data = brfss_analytics)
)
r2_table <- map_dfr(names(models), \(name) {
g <- glance(models[[name]])
tibble(
Model = name,
p = length(coef(models[[name]])) - 1,
`Adj. R²` = round(g$adj.r.squared, 4),
BIC = round(g$BIC, 1)
)
})
r2_table |>
kable(caption = "Model Comparison") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Model | p | Adj. R² | BIC |
|---|---|---|---|
| Mental health | 1 | 0.0982 | 56667.7 |
|
2 | 0.1084 | 56585.0 |
|
3 | 0.1085 | 56591.8 |
|
4 | 0.1455 | 56260.7 |
|
7 | 0.1637 | 56112.6 |
|
10 | 0.1674 | 56101.1 |
|
12 | 0.3220 | 54474.0 |
|
14 | 0.3220 | 54490.4 |
In this model, once there are 12 parameters, the adjusted R-squared is maximized (0.3220). This means the highest adjusted R-squared was observed in the (+ general health model).
Likewise, in this model, the smallest BIC is observed in the model that includes 12 parameters (BIC = 54474.0). This means the lowest BIC was also observed in the (+ general health model).
Based on the observed Adjusted R-squared and BIC vales among the various models, it appears that the (+ general health model) explains the highest amount of variance in the outcome while having the lowest level of complexity. This means these two criteria agree that the best model size includes the 12 parameters.
# Using the step() function, perform the following three selection procedures. For each, report which variables are in the final model.
# Backward elimination: Start from the maximum model (direction = "backward")
# Start from the maximum model
mod_back <- step(max_mod, direction = "backward", trace = 1)## Start: AIC=31673.55
## physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group +
## education + gen_health + marital
##
## Df Sum of Sq RSS AIC
## - education 3 44 417795 31668
## - marital 2 82 417833 31671
## - sex 1 9 417760 31672
## <none> 417751 31674
## - bmi 1 153 417905 31674
## - age_group 3 2301 420052 31711
## - exercise 1 3835 421586 31745
## - menthlth_days 1 19435 437186 32035
## - gen_health 2 94594 512345 33302
##
## Step: AIC=31668.4
## physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group +
## gen_health + marital
##
## Df Sum of Sq RSS AIC
## - marital 2 84 417879 31666
## - sex 1 10 417805 31667
## <none> 417795 31668
## - bmi 1 153 417948 31669
## - age_group 3 2301 420096 31706
## - exercise 1 3918 421713 31741
## - menthlth_days 1 19459 437254 32031
## - gen_health 2 96640 514435 33329
##
## Step: AIC=31666
## physhlth_days ~ menthlth_days + bmi + sex + exercise + age_group +
## gen_health
##
## Df Sum of Sq RSS AIC
## - sex 1 15 417894 31664
## <none> 417879 31666
## - bmi 1 150 418029 31667
## - age_group 3 3198 421077 31721
## - exercise 1 3924 421803 31739
## - menthlth_days 1 19538 437417 32030
## - gen_health 2 97825 515704 33345
##
## Step: AIC=31664.28
## physhlth_days ~ menthlth_days + bmi + exercise + age_group +
## gen_health
##
## Df Sum of Sq RSS AIC
## <none> 417894 31664
## - bmi 1 149 418043 31665
## - age_group 3 3250 421144 31720
## - exercise 1 3971 421864 31738
## - menthlth_days 1 19882 437776 32034
## - gen_health 2 97843 515737 33343
# Report which variables are in the final model
tidy(mod_back, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Backward Elimination Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.0211 | 0.4526 | 2.2562 | 0.0241 | 0.1339 | 1.9082 |
| menthlth_days | 0.2057 | 0.0106 | 19.4985 | 0.0000 | 0.1851 | 0.2264 |
| bmi | 0.0212 | 0.0126 | 1.6870 | 0.0916 | -0.0034 | 0.0459 |
| exerciseYes | -1.7623 | 0.2022 | -8.7137 | 0.0000 | -2.1588 | -1.3659 |
| age_group35-49 | 0.3061 | 0.2723 | 1.1241 | 0.2610 | -0.2277 | 0.8398 |
| age_group50-64 | 1.0723 | 0.2565 | 4.1804 | 0.0000 | 0.5695 | 1.5751 |
| age_group65+ | 1.6841 | 0.2447 | 6.8820 | 0.0000 | 1.2044 | 2.1638 |
| gen_healthGood | 1.2797 | 0.1888 | 6.7785 | 0.0000 | 0.9096 | 1.6498 |
| gen_healthFair/Poor | 10.3518 | 0.2463 | 42.0277 | 0.0000 | 9.8690 | 10.8347 |
# Forward selection: Start from an intercept-only model with the maximum model as the upper scope (direction = "forward")
# Start from an intercept-only model
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytics)
# Use maxium model as the upper scope (direction = "forward")
mod_forward <- step(mod_null,
scope = list(lower = mod_null, upper = max_mod),
direction = "forward", trace = 1)## Start: AIC=34767.94
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 172936 444258 32142
## + menthlth_days 1 60700 556494 33942
## + exercise 1 36694 580500 34280
## + education 3 10632 606562 34635
## + bmi 1 9513 607682 34646
## + marital 2 8185 609009 34665
## + age_group 3 6839 610355 34685
## + sex 1 1231 615963 34754
## <none> 617194 34768
##
## Step: AIC=32141.72
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 18156.4 426102 31810
## + exercise 1 5642.9 438615 32041
## + age_group 3 984.8 443273 32130
## + sex 1 646.6 443612 32132
## + marital 2 670.1 443588 32134
## + bmi 1 361.0 443897 32137
## <none> 444258 32142
## + education 3 175.3 444083 32145
##
## Step: AIC=31809.89
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 4865.1 421237 31720
## + age_group 3 3865.5 422236 31743
## + marital 2 1206.0 424896 31791
## + bmi 1 291.1 425811 31806
## + sex 1 163.4 425938 31809
## <none> 426102 31810
## + education 3 112.8 425989 31814
##
## Step: AIC=31720.03
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3194.2 418043 31665
## + marital 2 1020.6 420216 31705
## <none> 421237 31720
## + bmi 1 93.0 421144 31720
## + sex 1 63.9 421173 31721
## + education 3 64.9 421172 31725
##
## Step: AIC=31665.13
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + bmi 1 148.834 417894 31664
## <none> 418043 31665
## + sex 1 13.102 418029 31667
## + marital 2 86.081 417956 31667
## + education 3 45.539 417997 31670
##
## Step: AIC=31664.28
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## bmi
##
## Df Sum of Sq RSS AIC
## <none> 417894 31664
## + sex 1 14.648 417879 31666
## + marital 2 89.092 417805 31667
## + education 3 45.895 417848 31669
# Report which variables are in the final model
tidy(mod_forward, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Forward Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.0211 | 0.4526 | 2.2562 | 0.0241 | 0.1339 | 1.9082 |
| gen_healthGood | 1.2797 | 0.1888 | 6.7785 | 0.0000 | 0.9096 | 1.6498 |
| gen_healthFair/Poor | 10.3518 | 0.2463 | 42.0277 | 0.0000 | 9.8690 | 10.8347 |
| menthlth_days | 0.2057 | 0.0106 | 19.4985 | 0.0000 | 0.1851 | 0.2264 |
| exerciseYes | -1.7623 | 0.2022 | -8.7137 | 0.0000 | -2.1588 | -1.3659 |
| age_group35-49 | 0.3061 | 0.2723 | 1.1241 | 0.2610 | -0.2277 | 0.8398 |
| age_group50-64 | 1.0723 | 0.2565 | 4.1804 | 0.0000 | 0.5695 | 1.5751 |
| age_group65+ | 1.6841 | 0.2447 | 6.8820 | 0.0000 | 1.2044 | 2.1638 |
| bmi | 0.0212 | 0.0126 | 1.6870 | 0.0916 | -0.0034 | 0.0459 |
# Stepwise selection: Start from the intercept-only model with both directions allowed (direction = "both")
mod_stepwise <- step(mod_null,
scope = list(lower = mod_null, upper = max_mod),
direction = "both", trace = 1)## Start: AIC=34767.94
## physhlth_days ~ 1
##
## Df Sum of Sq RSS AIC
## + gen_health 2 172936 444258 32142
## + menthlth_days 1 60700 556494 33942
## + exercise 1 36694 580500 34280
## + education 3 10632 606562 34635
## + bmi 1 9513 607682 34646
## + marital 2 8185 609009 34665
## + age_group 3 6839 610355 34685
## + sex 1 1231 615963 34754
## <none> 617194 34768
##
## Step: AIC=32141.72
## physhlth_days ~ gen_health
##
## Df Sum of Sq RSS AIC
## + menthlth_days 1 18156 426102 31810
## + exercise 1 5643 438615 32041
## + age_group 3 985 443273 32130
## + sex 1 647 443612 32132
## + marital 2 670 443588 32134
## + bmi 1 361 443897 32137
## <none> 444258 32142
## + education 3 175 444083 32145
## - gen_health 2 172936 617194 34768
##
## Step: AIC=31809.89
## physhlth_days ~ gen_health + menthlth_days
##
## Df Sum of Sq RSS AIC
## + exercise 1 4865 421237 31720
## + age_group 3 3866 422236 31743
## + marital 2 1206 424896 31791
## + bmi 1 291 425811 31806
## + sex 1 163 425938 31809
## <none> 426102 31810
## + education 3 113 425989 31814
## - menthlth_days 1 18156 444258 32142
## - gen_health 2 130393 556494 33942
##
## Step: AIC=31720.03
## physhlth_days ~ gen_health + menthlth_days + exercise
##
## Df Sum of Sq RSS AIC
## + age_group 3 3194 418043 31665
## + marital 2 1021 420216 31705
## <none> 421237 31720
## + bmi 1 93 421144 31720
## + sex 1 64 421173 31721
## + education 3 65 421172 31725
## - exercise 1 4865 426102 31810
## - menthlth_days 1 17379 438615 32041
## - gen_health 2 108844 530080 33555
##
## Step: AIC=31665.13
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group
##
## Df Sum of Sq RSS AIC
## + bmi 1 149 417894 31664
## <none> 418043 31665
## + sex 1 13 418029 31667
## + marital 2 86 417956 31667
## + education 3 46 417997 31670
## - age_group 3 3194 421237 31720
## - exercise 1 4194 422236 31743
## - menthlth_days 1 19893 437935 32035
## - gen_health 2 100730 518772 33388
##
## Step: AIC=31664.28
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## bmi
##
## Df Sum of Sq RSS AIC
## <none> 417894 31664
## - bmi 1 149 418043 31665
## + sex 1 15 417879 31666
## + marital 2 89 417805 31667
## + education 3 46 417848 31669
## - age_group 3 3250 421144 31720
## - exercise 1 3971 421864 31738
## - menthlth_days 1 19882 437776 32034
## - gen_health 2 97843 515737 33343
# Report which variables are in the final model
tidy(mod_stepwise, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Stepwise Selection Result (AIC-based)",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.0211 | 0.4526 | 2.2562 | 0.0241 | 0.1339 | 1.9082 |
| gen_healthGood | 1.2797 | 0.1888 | 6.7785 | 0.0000 | 0.9096 | 1.6498 |
| gen_healthFair/Poor | 10.3518 | 0.2463 | 42.0277 | 0.0000 | 9.8690 | 10.8347 |
| menthlth_days | 0.2057 | 0.0106 | 19.4985 | 0.0000 | 0.1851 | 0.2264 |
| exerciseYes | -1.7623 | 0.2022 | -8.7137 | 0.0000 | -2.1588 | -1.3659 |
| age_group35-49 | 0.3061 | 0.2723 | 1.1241 | 0.2610 | -0.2277 | 0.8398 |
| age_group50-64 | 1.0723 | 0.2565 | 4.1804 | 0.0000 | 0.5695 | 1.5751 |
| age_group65+ | 1.6841 | 0.2447 | 6.8820 | 0.0000 | 1.2044 | 2.1638 |
| bmi | 0.0212 | 0.0126 | 1.6870 | 0.0916 | -0.0034 | 0.0459 |
While all three models excluded the same three predictors (sex, marital, and education), and retain the same six predictors (general health, mental health, exercise, age group, and BMI), only the forward and stepwise selections had the same order of entry general health (the strongest predictor), followed by mental health, exercise, age group, and BMI. In comparison, the backward elimination had listed the retained variables based on the order in which the were entered into the model regardless of their contribution to the model (mental health days, BMI exercise, age group, and general health status).
In choosing my final model, I will use the variables
menthlth_days, bmi, exercise,
age_group, and gen_health as predictors for
the outcome physhlth_days. This choice was made using the
backward elimination (AIC), forward (AIC), and stepwise (AIC) testing as
each test displayed the same adjusted r-squared (0.3222), AIC (54369.3),
and BIC (54439.2) which were lower than the adjusted r-squared (0.3220),
AIC (54378.6), and BIC (54490.4) of the full model. Therefore, the final
model includes only the variables retained by these tests, as they
explain the most variance in the outcome while maintaining the lowest
complexity.
method_comparison <- tribble(
~Method, ~`Variables selected`, ~`Adj. R²`, ~AIC, ~BIC,
"Maximum model",
length(coef(max_mod)) - 1,
round(glance(max_mod)$adj.r.squared, 4),
round(AIC(max_mod), 1),
round(BIC(max_mod), 1),
"Backward (AIC)",
length(coef(mod_back)) - 1,
round(glance(mod_back)$adj.r.squared, 4),
round(AIC(mod_back), 1),
round(BIC(mod_back), 1),
"Forward (AIC)",
length(coef(mod_forward)) - 1,
round(glance(mod_forward)$adj.r.squared, 4),
round(AIC(mod_forward), 1),
round(BIC(mod_forward), 1),
"Stepwise (AIC)",
length(coef(mod_stepwise)) - 1,
round(glance(mod_stepwise)$adj.r.squared, 4),
round(AIC(mod_stepwise), 1),
round(BIC(mod_stepwise), 1)
)
method_comparison |>
kable(caption = "Comparison of Variable Selection Methods") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Method | Variables selected | Adj. R² | AIC | BIC |
|---|---|---|---|---|
| Maximum model | 14 | 0.3220 | 54378.6 | 54490.4 |
| Backward (AIC) | 8 | 0.3222 | 54369.3 | 54439.2 |
| Forward (AIC) | 8 | 0.3222 | 54369.3 | 54439.2 |
| Stepwise (AIC) | 8 | 0.3222 | 54369.3 | 54439.2 |
# Fit the final model
final_mod <- lm(physhlth_days ~ menthlth_days + bmi + exercise + age_group + gen_health,
data = brfss_analytics)
# Display the coefficient table for your final model using tidy() with confidence intervals
tidy(final_mod, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 1.0211 | 0.4526 | 2.2562 | 0.0241 | 0.1339 | 1.9082 |
| menthlth_days | 0.2057 | 0.0106 | 19.4985 | 0.0000 | 0.1851 | 0.2264 |
| bmi | 0.0212 | 0.0126 | 1.6870 | 0.0916 | -0.0034 | 0.0459 |
| exerciseYes | -1.7623 | 0.2022 | -8.7137 | 0.0000 | -2.1588 | -1.3659 |
| age_group35-49 | 0.3061 | 0.2723 | 1.1241 | 0.2610 | -0.2277 | 0.8398 |
| age_group50-64 | 1.0723 | 0.2565 | 4.1804 | 0.0000 | 0.5695 | 1.5751 |
| age_group65+ | 1.6841 | 0.2447 | 6.8820 | 0.0000 | 1.2044 | 2.1638 |
| gen_healthGood | 1.2797 | 0.1888 | 6.7785 | 0.0000 | 0.9096 | 1.6498 |
| gen_healthFair/Poor | 10.3518 | 0.2463 | 42.0277 | 0.0000 | 9.8690 | 10.8347 |
In the final model, for every one unit increase in BMI, there is an observed 0.0212 day increase in the number of physically unhealthy days an individual will report, holding all other variables constant. Additionally, compared to individuals who do not exercise, individuals who do exercise will report on average 1.7623 fewer physically unhealthy days, holding all other variables constant.Finally, individuals who report having a Fair/Poor general health status are more likely to report 10.3518 more physically unhealthy days on average compared to individuals who reporting having a Excellent/VG general health status, holding all other variables constant.
# Using your final linear model, produce the four base R diagnostic plots
par(mfrow = c(2, 2))
plot(final_mod )Looking at the residuals vs, fitted graph, the loess line only partially lies near zero with some observed curves which suggests some evidence of non-linearity. However, there also appears to be a slight funnel shape within the residuals which would suggest heteroscedasticity (non-constant variance).
The residuals do not approximately follow a normal distribution, but rather they show an s-shaped curve indicating the residuals are heavily skewed to the right as well as moderately skewed to the left.
The spread of residuals is not roughly constant across fitted values. This is indicated by the downward sloping of the loess line. This means that the spread of residuals increases systematically with fitted values.
There appears to be highly influential observations as indicated by the data points that are far from the loess line.
In conclusion, the LINE assumptions for this final model are not reasonably satisfied. This is because there were violations of linearity, normality, homoscedasicity, and the presence of influential observations, as indicated in the panel above.
# Create a new variable frequent_distress: 1 if physhlth_days >= 14, 0 if physhlth_days < 14, Store as a factor with levels "No" (reference) and "Yes" (See re-coding section above)
# Report the prevalence of frequent physical distress (count and percentage overall)
tibble(Metric = c("Observations", "Variables", "FPD Cases", "FPD Prevalence"),
Value = c(nrow(brfss_analytics), ncol(brfss_analytics),
sum(brfss_analytics$fpd == "Yes"),
paste0(round(100 * mean(brfss_analytics$fpd == "Yes"), 1), "%"))) |>
kable(caption = "FDP Dataset Overview") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 18 |
| FPD Cases | 1072 |
| FPD Prevalence | 13.4% |
# Report the prevalence of frequent physical distress (count and percentage in each category)
brfss_fpd <- brfss_analytics |>
gtsummary::tbl_summary(
by = fpd,
include = c(physhlth_days, menthlth_days, bmi, exercise, age_group, gen_health),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
physhlth_days ~ "Physical unhealthy days",
menthlth_days ~ "Mental unhealthy days",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days",
age_group ~ "Age group",
gen_health ~ "General health status"
)
) |>
add_overall() |>
add_p() |>
bold_labels()
print(brfss_fpd)## <div id="cxthcybgcc" style="padding-left:0px;padding-right:0px;padding-top:10px;padding-bottom:10px;overflow-x:auto;overflow-y:auto;width:auto;height:auto;">
## <style>#cxthcybgcc table {
## font-family: system-ui, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji';
## -webkit-font-smoothing: antialiased;
## -moz-osx-font-smoothing: grayscale;
## }
##
## #cxthcybgcc thead, #cxthcybgcc tbody, #cxthcybgcc tfoot, #cxthcybgcc tr, #cxthcybgcc td, #cxthcybgcc th {
## border-style: none;
## }
##
## #cxthcybgcc p {
## margin: 0;
## padding: 0;
## }
##
## #cxthcybgcc .gt_table {
## display: table;
## border-collapse: collapse;
## line-height: normal;
## margin-left: auto;
## margin-right: auto;
## color: #333333;
## font-size: 13px;
## font-weight: normal;
## font-style: normal;
## background-color: #FFFFFF;
## width: auto;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #A8A8A8;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #A8A8A8;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_caption {
## padding-top: 4px;
## padding-bottom: 4px;
## }
##
## #cxthcybgcc .gt_title {
## color: #333333;
## font-size: 125%;
## font-weight: initial;
## padding-top: 4px;
## padding-bottom: 4px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-color: #FFFFFF;
## border-bottom-width: 0;
## }
##
## #cxthcybgcc .gt_subtitle {
## color: #333333;
## font-size: 85%;
## font-weight: initial;
## padding-top: 3px;
## padding-bottom: 5px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-color: #FFFFFF;
## border-top-width: 0;
## }
##
## #cxthcybgcc .gt_heading {
## background-color: #FFFFFF;
## text-align: center;
## border-bottom-color: #FFFFFF;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_bottom_border {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_col_headings {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_col_heading {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 6px;
## padding-left: 5px;
## padding-right: 5px;
## overflow-x: hidden;
## }
##
## #cxthcybgcc .gt_column_spanner_outer {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: normal;
## text-transform: inherit;
## padding-top: 0;
## padding-bottom: 0;
## padding-left: 4px;
## padding-right: 4px;
## }
##
## #cxthcybgcc .gt_column_spanner_outer:first-child {
## padding-left: 0;
## }
##
## #cxthcybgcc .gt_column_spanner_outer:last-child {
## padding-right: 0;
## }
##
## #cxthcybgcc .gt_column_spanner {
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: bottom;
## padding-top: 5px;
## padding-bottom: 5px;
## overflow-x: hidden;
## display: inline-block;
## width: 100%;
## }
##
## #cxthcybgcc .gt_spanner_row {
## border-bottom-style: hidden;
## }
##
## #cxthcybgcc .gt_group_heading {
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## text-align: left;
## }
##
## #cxthcybgcc .gt_empty_group_heading {
## padding: 0.5px;
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## vertical-align: middle;
## }
##
## #cxthcybgcc .gt_from_md > :first-child {
## margin-top: 0;
## }
##
## #cxthcybgcc .gt_from_md > :last-child {
## margin-bottom: 0;
## }
##
## #cxthcybgcc .gt_row {
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## margin: 10px;
## border-top-style: solid;
## border-top-width: 1px;
## border-top-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 1px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 1px;
## border-right-color: #D3D3D3;
## vertical-align: middle;
## overflow-x: hidden;
## }
##
## #cxthcybgcc .gt_stub {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cxthcybgcc .gt_stub_row_group {
## color: #333333;
## background-color: #FFFFFF;
## font-size: 100%;
## font-weight: initial;
## text-transform: inherit;
## border-right-style: solid;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## padding-left: 5px;
## padding-right: 5px;
## vertical-align: top;
## }
##
## #cxthcybgcc .gt_row_group_first td {
## border-top-width: 2px;
## }
##
## #cxthcybgcc .gt_row_group_first th {
## border-top-width: 2px;
## }
##
## #cxthcybgcc .gt_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cxthcybgcc .gt_first_summary_row {
## border-top-style: solid;
## border-top-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_first_summary_row.thick {
## border-top-width: 2px;
## }
##
## #cxthcybgcc .gt_last_summary_row {
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_grand_summary_row {
## color: #333333;
## background-color: #FFFFFF;
## text-transform: inherit;
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cxthcybgcc .gt_first_grand_summary_row {
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## border-top-style: double;
## border-top-width: 6px;
## border-top-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_last_grand_summary_row_top {
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## border-bottom-style: double;
## border-bottom-width: 6px;
## border-bottom-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_striped {
## background-color: rgba(128, 128, 128, 0.05);
## }
##
## #cxthcybgcc .gt_table_body {
## border-top-style: solid;
## border-top-width: 2px;
## border-top-color: #D3D3D3;
## border-bottom-style: solid;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_footnotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_footnote {
## margin: 0px;
## font-size: 90%;
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cxthcybgcc .gt_sourcenotes {
## color: #333333;
## background-color: #FFFFFF;
## border-bottom-style: none;
## border-bottom-width: 2px;
## border-bottom-color: #D3D3D3;
## border-left-style: none;
## border-left-width: 2px;
## border-left-color: #D3D3D3;
## border-right-style: none;
## border-right-width: 2px;
## border-right-color: #D3D3D3;
## }
##
## #cxthcybgcc .gt_sourcenote {
## font-size: 90%;
## padding-top: 1px;
## padding-bottom: 1px;
## padding-left: 5px;
## padding-right: 5px;
## }
##
## #cxthcybgcc .gt_left {
## text-align: left;
## }
##
## #cxthcybgcc .gt_center {
## text-align: center;
## }
##
## #cxthcybgcc .gt_right {
## text-align: right;
## font-variant-numeric: tabular-nums;
## }
##
## #cxthcybgcc .gt_font_normal {
## font-weight: normal;
## }
##
## #cxthcybgcc .gt_font_bold {
## font-weight: bold;
## }
##
## #cxthcybgcc .gt_font_italic {
## font-style: italic;
## }
##
## #cxthcybgcc .gt_super {
## font-size: 65%;
## }
##
## #cxthcybgcc .gt_footnote_marks {
## font-size: 75%;
## vertical-align: 0.4em;
## position: initial;
## }
##
## #cxthcybgcc .gt_asterisk {
## font-size: 100%;
## vertical-align: 0;
## }
##
## #cxthcybgcc .gt_indent_1 {
## text-indent: 5px;
## }
##
## #cxthcybgcc .gt_indent_2 {
## text-indent: 10px;
## }
##
## #cxthcybgcc .gt_indent_3 {
## text-indent: 15px;
## }
##
## #cxthcybgcc .gt_indent_4 {
## text-indent: 20px;
## }
##
## #cxthcybgcc .gt_indent_5 {
## text-indent: 25px;
## }
##
## #cxthcybgcc .katex-display {
## display: inline-flex !important;
## margin-bottom: 0.75em !important;
## }
##
## #cxthcybgcc div.Reactable > div.rt-table > div.rt-thead > div.rt-tr.rt-tr-group-header > div.rt-th-group:after {
## height: 0px !important;
## }
## </style>
## <table class="gt_table" data-quarto-disable-processing="false" data-quarto-bootstrap="false">
## <thead>
## <tr class="gt_col_headings">
## <th class="gt_col_heading gt_columns_bottom_border gt_left" rowspan="1" colspan="1" scope="col" id="label"><span class='gt_from_md'><strong>Characteristic</strong></span></th>
## <th class="gt_col_heading gt_columns_bottom_border gt_center" rowspan="1" colspan="1" scope="col" id="stat_0"><span class='gt_from_md'><strong>Overall</strong><br />
## N = 8,000</span><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
## <th class="gt_col_heading gt_columns_bottom_border gt_center" rowspan="1" colspan="1" scope="col" id="stat_1"><span class='gt_from_md'><strong>No</strong><br />
## N = 6,928</span><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
## <th class="gt_col_heading gt_columns_bottom_border gt_center" rowspan="1" colspan="1" scope="col" id="stat_2"><span class='gt_from_md'><strong>Yes</strong><br />
## N = 1,072</span><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span></th>
## <th class="gt_col_heading gt_columns_bottom_border gt_center" rowspan="1" colspan="1" scope="col" id="p.value"><span class='gt_from_md'><strong>p-value</strong></span><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>2</sup></span></th>
## </tr>
## </thead>
## <tbody class="gt_table_body">
## <tr><td headers="label" class="gt_row gt_left" style="font-weight: bold;">Physical unhealthy days</td>
## <td headers="stat_0" class="gt_row gt_center">4 (9)</td>
## <td headers="stat_1" class="gt_row gt_center">1 (2)</td>
## <td headers="stat_2" class="gt_row gt_center">25 (6)</td>
## <td headers="p.value" class="gt_row gt_center"><0.001</td></tr>
## <tr><td headers="label" class="gt_row gt_left" style="font-weight: bold;">Mental unhealthy days</td>
## <td headers="stat_0" class="gt_row gt_center">4 (8)</td>
## <td headers="stat_1" class="gt_row gt_center">3 (7)</td>
## <td headers="stat_2" class="gt_row gt_center">10 (12)</td>
## <td headers="p.value" class="gt_row gt_center"><0.001</td></tr>
## <tr><td headers="label" class="gt_row gt_left" style="font-weight: bold;">BMI</td>
## <td headers="stat_0" class="gt_row gt_center">29 (7)</td>
## <td headers="stat_1" class="gt_row gt_center">28 (6)</td>
## <td headers="stat_2" class="gt_row gt_center">30 (8)</td>
## <td headers="p.value" class="gt_row gt_center"><0.001</td></tr>
## <tr><td headers="label" class="gt_row gt_left" style="font-weight: bold;">Exercise in past 30 days</td>
## <td headers="stat_0" class="gt_row gt_center">6,120 (77%)</td>
## <td headers="stat_1" class="gt_row gt_center">5,553 (80%)</td>
## <td headers="stat_2" class="gt_row gt_center">567 (53%)</td>
## <td headers="p.value" class="gt_row gt_center"><0.001</td></tr>
## <tr><td headers="label" class="gt_row gt_left" style="font-weight: bold;">Age group</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td>
## <td headers="stat_1" class="gt_row gt_center"><br /></td>
## <td headers="stat_2" class="gt_row gt_center"><br /></td>
## <td headers="p.value" class="gt_row gt_center"><0.001</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> 18-34</td>
## <td headers="stat_0" class="gt_row gt_center">1,355 (17%)</td>
## <td headers="stat_1" class="gt_row gt_center">1,249 (18%)</td>
## <td headers="stat_2" class="gt_row gt_center">106 (9.9%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> 35-49</td>
## <td headers="stat_0" class="gt_row gt_center">1,522 (19%)</td>
## <td headers="stat_1" class="gt_row gt_center">1,377 (20%)</td>
## <td headers="stat_2" class="gt_row gt_center">145 (14%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> 50-64</td>
## <td headers="stat_0" class="gt_row gt_center">2,108 (26%)</td>
## <td headers="stat_1" class="gt_row gt_center">1,784 (26%)</td>
## <td headers="stat_2" class="gt_row gt_center">324 (30%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> 65+</td>
## <td headers="stat_0" class="gt_row gt_center">3,015 (38%)</td>
## <td headers="stat_1" class="gt_row gt_center">2,518 (36%)</td>
## <td headers="stat_2" class="gt_row gt_center">497 (46%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left" style="font-weight: bold;">General health status</td>
## <td headers="stat_0" class="gt_row gt_center"><br /></td>
## <td headers="stat_1" class="gt_row gt_center"><br /></td>
## <td headers="stat_2" class="gt_row gt_center"><br /></td>
## <td headers="p.value" class="gt_row gt_center"><0.001</td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Excellent/VG</td>
## <td headers="stat_0" class="gt_row gt_center">3,954 (49%)</td>
## <td headers="stat_1" class="gt_row gt_center">3,821 (55%)</td>
## <td headers="stat_2" class="gt_row gt_center">133 (12%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Good</td>
## <td headers="stat_0" class="gt_row gt_center">2,576 (32%)</td>
## <td headers="stat_1" class="gt_row gt_center">2,335 (34%)</td>
## <td headers="stat_2" class="gt_row gt_center">241 (22%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## <tr><td headers="label" class="gt_row gt_left"> Fair/Poor</td>
## <td headers="stat_0" class="gt_row gt_center">1,470 (18%)</td>
## <td headers="stat_1" class="gt_row gt_center">772 (11%)</td>
## <td headers="stat_2" class="gt_row gt_center">698 (65%)</td>
## <td headers="p.value" class="gt_row gt_center"><br /></td></tr>
## </tbody>
## <tfoot>
## <tr class="gt_footnotes">
## <td class="gt_footnote" colspan="5"><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>1</sup></span> <span class='gt_from_md'>Mean (SD); n (%)</span></td>
## </tr>
## <tr class="gt_footnotes">
## <td class="gt_footnote" colspan="5"><span class="gt_footnote_marks" style="white-space:nowrap;font-style:italic;font-weight:normal;line-height:0;"><sup>2</sup></span> <span class='gt_from_md'>Wilcoxon rank sum test; Pearson’s Chi-squared test</span></td>
## </tr>
## </tfoot>
## </table>
## </div>
Based on the FDP dataset overview, it appears that there are 1072 frequent physical distress (FPD) cases among the 8000 observations with an overall FPD prevalence of 13.4% among all observations. When looking at the frequency and prevalence of FPD across each variable in the final model (physhlth_days, menthlth_days, bmi, exercise, age_group, gen_health), they appear to be highest within the general health status “Fair/Poor” and the “65+” age group categories.
Out of all of the variables from the linear model (physhlth_days, menthlth_days, bmi, exercise, age_group, gen_health), I am selecting the predictor gen_health. I am selecting this predictor because its categories were determined to have the highest coefficient estimate during the backward elimination (AIC), forward (AIC), and stepwise (AIC) testing.
# Fit a simple logistic regression between frequent physical distress and gen_health
log_simp <- glm(fpd ~ gen_health,
data = brfss_analytics, family = binomial
)
# Display the model summary
summary(log_simp)##
## Call:
## glm(formula = fpd ~ gen_health, family = binomial, data = brfss_analytics)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.35792 0.08821 -38.069 <2e-16 ***
## gen_healthGood 1.08695 0.11117 9.778 <2e-16 ***
## gen_healthFair/Poor 3.25715 0.10251 31.774 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6302.7 on 7999 degrees of freedom
## Residual deviance: 4798.6 on 7997 degrees of freedom
## AIC: 4804.6
##
## Number of Fisher Scoring iterations: 6
# Calculate and report the odds ratio and 95% confidence interval
tidy(log_simp, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(c(estimate, std.error, statistic, conf.low, conf.high),
\(x) round(x, 3)),
p.value = format.pval(p.value, digits = 3)) |>
kable(caption = "Wald Tests for Each Coefficient (Exponentiated)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.035 | 0.088 | -38.069 | <2e-16 | 0.029 | 0.041 |
| gen_healthGood | 2.965 | 0.111 | 9.778 | <2e-16 | 2.389 | 3.695 |
| gen_healthFair/Poor | 25.975 | 0.103 | 31.774 | <2e-16 | 21.317 | 31.869 |
Compared to participants who reported having a excellent/VG general health status, participants who reported having a good general health status have 2.97 times the odds of frequent distresss, 95% CI [2.39, 3.70]. Compared to participants who reported having a excellent/VG general health status, participants who reported having a fair/poor general health status have about 26.0 times the odds of frequent physical distress, 95% CI [21.3, 31.9].
This association is statistically significant at alpha = 0.05. This is because both sub categories (Good and Fair/Poor) within the general health status have CI’s excluding 1 as well as Wald test p-values that are below 0.05.
# Fit a multiple logistic regression using the same predictors as your final linear model from Part 1:
mod_logistic <- glm(fpd ~ menthlth_days + bmi + exercise + age_group + gen_health, data = brfss_analytics, family = binomial)
# Display the model summary
summary(mod_logistic)##
## Call:
## glm(formula = fpd ~ menthlth_days + bmi + exercise + age_group +
## gen_health, family = binomial, data = brfss_analytics)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.745112 0.218626 -17.130 < 2e-16 ***
## menthlth_days 0.054597 0.003900 13.999 < 2e-16 ***
## bmi 0.004266 0.005285 0.807 0.420
## exerciseYes -0.510469 0.082226 -6.208 5.36e-10 ***
## age_group35-49 0.177971 0.153920 1.156 0.248
## age_group50-64 0.556429 0.138784 4.009 6.09e-05 ***
## age_group65+ 0.847268 0.134135 6.317 2.68e-10 ***
## gen_healthGood 0.843142 0.114447 7.367 1.74e-13 ***
## gen_healthFair/Poor 2.719404 0.109253 24.891 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6302.7 on 7999 degrees of freedom
## Residual deviance: 4528.2 on 7991 degrees of freedom
## AIC: 4546.2
##
## Number of Fisher Scoring iterations: 6
# Calculate and display a table of adjusted odds ratios with 95% confidence intervals for all predictors:
mod_logistic |>
tbl_regression(
exponentiate = TRUE,
label = list(
menthlth_days ~ "Mental unhealthy days",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days",
age_group ~ "Age group",
gen_health ~ "General health status"
)
) |>
bold_labels() |>
bold_p()| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| Mental unhealthy days | 1.06 | 1.05, 1.06 | <0.001 |
| BMI | 1.00 | 0.99, 1.01 | 0.4 |
| Exercise in past 30 days | |||
| No | — | — | |
| Yes | 0.60 | 0.51, 0.71 | <0.001 |
| Age group | |||
| 18-34 | — | — | |
| 35-49 | 1.19 | 0.88, 1.62 | 0.2 |
| 50-64 | 1.74 | 1.33, 2.30 | <0.001 |
| 65+ | 2.33 | 1.80, 3.05 | <0.001 |
| General health status | |||
| Excellent/VG | — | — | |
| Good | 2.32 | 1.86, 2.91 | <0.001 |
| Fair/Poor | 15.2 | 12.3, 18.9 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
Based on the adjusted odds ratios, each one-unit increase in mentally unhealthy days is associated with 1.06 times the odds (a 6% increase) of experiencing frequent physical distress, after adjusting for BMI, exercise, age, and general health status. Additionally, individuals who exercised in the past 30 days have 0.60 times the odds (a 40% lower likelihood) of frequent physical distress, controlling for the same factors, indicating a protective effect.
Regarding general health status, individuals reporting Fair/Poor health have 15.2 times the odds of experiencing frequent physical distress compared to those reporting Excellent/Very Good health, holding all other variables constant, indicating a strong risk factor. Similarly, those reporting Good health have 2.32 times the odds of frequent physical distress compared to individuals with Excellent/Very Good health, holding all other variables constant.
# Choose a group of related predictors to test collectively (for example, all income levels, all education levels, or all smoking levels) -> all age_group levels
# Fit a reduced model that excludes this group (age_group)
mod_reduced <- glm(fpd ~ menthlth_days + bmi + exercise + gen_health, data = brfss_analytics, family = binomial)
# Conduct a likelihood ratio test comparing the reduced and full models:
anova(mod_reduced, mod_logistic, test = "Chisq")## Analysis of Deviance Table
##
## Model 1: fpd ~ menthlth_days + bmi + exercise + gen_health
## Model 2: fpd ~ menthlth_days + bmi + exercise + age_group + gen_health
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7994 4586.6
## 2 7991 4528.2 3 58.383 1.302e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Report the test statistic (deviance), degrees of freedom, and p-value
anova(mod_reduced, mod_logistic, test = "LRT") |>
kable(digits = 3,
caption = "LR Test: Does adding exercise + smoker improve the model?") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Resid. Df | Resid. Dev | Df | Deviance | Pr(>Chi) |
|---|---|---|---|---|
| 7994 | 4586.561 | NA | NA | NA |
| 7991 | 4528.178 | 3 | 58.383 | 0 |
H₀: The model without age_group is sufficient (this group of predictors does not significantly improve the model) H₁: Including age_group improves model fit (this group of predictors does significantly improve the model)
At alpha = 0.05, age_group does significantly improve the model. For example, a small p-value means the dropped variables jointly contribute to model fit, and after dropping the age_group variable as a predictor, Pr(>chi) = 0.
# Use the pROC package to generate a ROC curve for your multiple logistic regression model:
roc_obj <- roc(brfss_analytics$fpd, fitted(mod_logistic))
plot(roc_obj, main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE)## Area under the curve: 0.8494
## 95% CI: 0.8362-0.8627 (DeLong)
The AUC of the model is (AUC = 0.8494). Using the provided guide within the assignment, since this value falls between 0.8-0.9, this means that the model has excellent discrimination which means that it is excellent at discriminating between individuals with and without frequent physical distress.
# Conduct the Hosmer-Lemeshow test using the ResourceSelection package:
hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.0826, df = 8, p-value = 0.7487
Test statistic (X-squared = 5.0826) df = 8 p-value = 0.7487
Null hypothesis (H0):The logistic regression model fits the data well (i.e., observed and predicted outcomes are similar across groups).
Alternative hypothesis (H1):The logistic regression model does not fit the data well (i.e., observed and predicted outcomes differ across groups).
At alpha = 0.05, there is no evidence of poor model fit since the Hosmer-Lemeshow results demonstrated a non-significant p-value (p-value = 0.7487) which indicates adequate fit.
The Hosmer-Lemeshow results complement the ROC/AUC findings because they test for calibration, or how well predicted probabilities agree with the observed outcomes, in comparison to ROC/AUC, which evaluates a model’s discrimination ability, or how well the model distinguishes between those with and without the outcome. Based on the findings of this model specifically, this means it can provide accurate probability estimates while excellently distinguishing between our two outcome groups (frequent physical distress and no frequent physical distress).
In reference to these results, about 32% of the variance in the
physical health burden among U.S. adults is explained by factors,
including mentally unhealthy days in the last 30 days, BMI, sex,
exercise status, age, education, general health status, and marital
status. However, after reducing this model to ensure that it would
explain the highest variance (about 32%) in physically unhealthy days
while maintaining the lowest complexity, only the factors
mental_health_days, bmi,
exercise, age_group, and
gen_health remained in both the linear and logistic models.
The strongest predictor identified in both the linear and logistic
models included the general health status category. No predictors were
significant in one model but were in the other. The key limitations of
using cross-sectional survey data in this analysis are that we cannot
establish temporality (cause-and-effect), there may be additional
confounding factors not addressed in the model, and the data are subject
to self-report bias. Two potential confounders that were not measured in
the model include occupational title and dietary patterns.
Logistic regression is used when the outcome is binary (e.g., frequent vs. no frequent physical distress) and estimates the probability of the outcome using adjusted odds ratios, allowing us to interpret how predictors are associated with the odds of experiencing the outcome while controlling for confounders. In contrast, linear regression is used for continuous outcomes (e.g., physically unhealthy days) and estimates the average change in the outcome per unit increase in a predictor.
Linear regression provides directly interpretable effect sizes in the original units of the outcome, whereas logistic regression provides odds ratios and predicted probabilities bounded between 0 and 1. Linear models may produce invalid probability predictions (below 0 or above 1) when used for binary outcomes, which is why logistic regression is preferred in those cases. Model evaluation also differs: linear regression commonly uses R² and residual diagnostics, while logistic regression relies on measures such as AUC (discrimination) and calibration metrics like the Hosmer–Lemeshow test.
OpenAI. (2026). ChatGPT (March 22-23 version) [Large language model]. chatgpt.com
Prompts: “What is wrong with my R code [“inserted originally attempted code”]
Where used: {r recode}, {r step 2}, {r step 2-1}, and {r step 2-3} chunk sections.
Responses: after uploading each of my original codes, ChatGpt had provided me with a clean version of each of my attempted codes which are displayed above. I verified the correctness of the output after ensuring the code ran as intended and comparing my results to another student within the course that had used the same predictors of interest. Additionally, Chatgpt provides a summary of why my original code was not running as intended which allowed me to determine where my formatting issues had preventing my code from running.