summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
plot(pressure)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.2
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.2
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(nnet)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:gtsummary':
##
## select
##
## The following object is masked from 'package:dplyr':
##
## select
library(brant)
## Warning: package 'brant' was built under R version 4.5.3
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.5.2
library(dplyr)
library(nnet)
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.3
## ResourceSelection 0.3-6 2023-06-27
library(leaps)
## Warning: package 'leaps' was built under R version 4.5.3
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(haven)
## Warning: package 'haven' was built under R version 4.5.2
brfss_data <- read_xpt("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Labs\\brfss dataset\\LLCP2023.XPT")
# Select only the variables needed for this assignment
brfss_selected <- brfss_data |>
dplyr::select(
PHYSHLTH,
MENTHLTH,
`_BMI5`,
SEXVAR,
EXERANY2,
ADDEPEV3,
`_AGEG5YR`,
`_INCOMG1`,
EDUCA,
`_SMOKER3`,
GENHLTH,
MARITAL
)
cat("Selected dataset dimensions:\n")
## Selected dataset dimensions:
cat(" Rows: ", format(nrow(brfss_selected), big.mark = ","), "\n")
## Rows: 433,323
cat(" Columns:", ncol(brfss_selected), "\n")
## Columns: 12
brfss_clean <- brfss_selected |>
mutate(
# Outcome: physically unhealthy days
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
),
# Continuous: mentally unhealthy days
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
# Continuous: BMI (divide by 100)
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
),
# Binary: sex
sex = factor(
case_when(SEXVAR == 1 ~ "Male", SEXVAR == 2 ~ "Female"),
levels = c("Male", "Female")
),
# Binary: exercise
exercise = factor(
case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
EXERANY2 %in% c(7, 9) ~ NA_character_
),
levels = c("No", "Yes")
),
# Binary: depression history
depression = factor(
case_when(
ADDEPEV3 == 1 ~ "Yes",
ADDEPEV3 == 2 ~ "No",
ADDEPEV3 %in% c(7, 9) ~ NA_character_
),
levels = c("No", "Yes")
),
# Categorical: age group
age_group = factor(
case_when(
`_AGEG5YR` %in% 1:3 ~ "18-34",
`_AGEG5YR` %in% 4:6 ~ "35-49",
`_AGEG5YR` %in% 7:9 ~ "50-64",
`_AGEG5YR` %in% 10:13 ~ "65+",
`_AGEG5YR` == 14 ~ NA_character_
),
levels = c("18-34", "35-49", "50-64", "65+")
),
# Categorical: income
income = factor(
case_when(
`_INCOMG1` %in% 1:2 ~ "Less than $25K",
`_INCOMG1` %in% 3:4 ~ "$25K-$49K",
`_INCOMG1` %in% 5:6 ~ "$50K-$99K",
`_INCOMG1` == 7 ~ "$100K+",
`_INCOMG1` == 9 ~ NA_character_
),
levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
),
# Categorical: education
education = factor(
case_when(
EDUCA %in% 1:3 ~ "Less than HS",
EDUCA == 4 ~ "HS/GED",
EDUCA == 5 ~ "Some college",
EDUCA == 6 ~ "College graduate",
EDUCA == 9 ~ NA_character_
),
levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
),
# Categorical: smoking
smoking = factor(
case_when(
`_SMOKER3` %in% 1:2 ~ "Current",
`_SMOKER3` == 3 ~ "Former",
`_SMOKER3` == 4 ~ "Never",
`_SMOKER3` == 9 ~ NA_character_
),
levels = c("Never", "Former", "Current")
),
# Categorical: general health
gen_health = factor(
case_when(
GENHLTH %in% 1:2 ~ "Excellent/Very Good",
GENHLTH == 3 ~ "Good",
GENHLTH %in% 4:5 ~ "Fair/Poor",
GENHLTH %in% c(7, 9) ~ NA_character_
),
levels = c("Excellent/Very Good", "Good", "Fair/Poor")
),
# Categorical: marital status
marital = factor(
case_when(
MARITAL %in% c(1, 6) ~ "Married/Partnered",
MARITAL %in% 2:4 ~ "Previously married",
MARITAL == 5 ~ "Never married",
MARITAL == 9 ~ NA_character_
),
levels = c("Married/Partnered", "Previously married", "Never married")
)
) |>
# Keep only the recoded variables for the analysis
dplyr::select(
physhlth_days, menthlth_days, bmi,
sex, exercise, depression,
age_group, income, education, smoking, gen_health, marital
)
str(brfss_clean)
## tibble [433,323 × 12] (S3: tbl_df/tbl/data.frame)
## $ physhlth_days: num [1:433323] 0 0 6 2 0 2 8 1 5 0 ...
## $ menthlth_days: num [1:433323] 0 0 2 0 0 3 NA 0 0 0 ...
## $ bmi : num [1:433323] 30.5 28.6 22.3 27.4 25.9 ...
## $ sex : Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 1 2 2 1 ...
## $ exercise : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 1 1 1 2 ...
## $ depression : Factor w/ 2 levels "No","Yes": 1 2 1 2 2 1 1 1 1 1 ...
## $ age_group : Factor w/ 4 levels "18-34","35-49",..: 4 4 4 4 4 3 4 4 4 4 ...
## $ income : Factor w/ 4 levels "Less than $25K",..: NA NA 1 NA 3 3 2 NA 2 3 ...
## $ education : Factor w/ 4 levels "Less than HS",..: 3 3 2 3 3 3 2 3 3 2 ...
## $ smoking : Factor w/ 3 levels "Never","Former",..: 1 1 2 1 1 1 2 1 1 2 ...
## $ gen_health : Factor w/ 3 levels "Excellent/Very Good",..: 1 1 3 1 3 2 3 3 2 2 ...
## $ marital : Factor w/ 3 levels "Married/Partnered",..: 1 2 2 1 2 2 2 2 2 1 ...
missing_summary <- brfss_clean |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "Variable", values_to = "N_missing") |>
mutate(Pct_missing = round(100 * N_missing / nrow(brfss_clean), 2))
missing_summary |>
kable(caption = "Missingness before complete-case filtering",
col.names = c("Variable", "N missing", "% missing")) |>
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Variable | N missing | % missing |
|---|---|---|
| physhlth_days | 10785 | 2.49 |
| menthlth_days | 8108 | 1.87 |
| bmi | 40535 | 9.35 |
| sex | 0 | 0.00 |
| exercise | 1251 | 0.29 |
| depression | 2587 | 0.60 |
| age_group | 7779 | 1.80 |
| income | 86623 | 19.99 |
| education | 2325 | 0.54 |
| smoking | 23062 | 5.32 |
| gen_health | 1262 | 0.29 |
| marital | 4289 | 0.99 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na() |>
slice_sample(n = 8000)
cat("Complete cases (pre-sample):", format(nrow(brfss_clean |> drop_na()),
big.mark = ","), "\n")
## Complete cases (pre-sample): 308,228
cat("Final analytic sample size :", format(nrow(brfss_analytic),
big.mark = ","), "\n")
## Final analytic sample size : 8,000
brfss_analytic |>
tbl_summary(
label = list(
physhlth_days ~ "Physically unhealthy days (past 30)",
menthlth_days ~ "Mentally unhealthy days (past 30)",
bmi ~ "Body mass index (kg/m²)",
sex ~ "Sex",
exercise ~ "Any exercise in past 30 days",
depression ~ "Ever told depressive disorder",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education",
smoking ~ "Smoking status",
gen_health ~ "Self-rated general health",
marital ~ "Marital status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1
) |>
modify_caption("**Table 1. Descriptive characteristics of the analytic sample (n = 8,000)**") |>
bold_labels()
| Characteristic | N = 8,0001 |
|---|---|
| Physically unhealthy days (past 30) | 4.3 (8.7) |
| Mentally unhealthy days (past 30) | 4.3 (8.2) |
| Body mass index (kg/m²) | 28.7 (6.5) |
| Sex | |
| Male | 3,971 (50%) |
| Female | 4,029 (50%) |
| Any exercise in past 30 days | 6,157 (77%) |
| Ever told depressive disorder | 1,776 (22%) |
| Age group | |
| 18-34 | 1,294 (16%) |
| 35-49 | 1,657 (21%) |
| 50-64 | 2,109 (26%) |
| 65+ | 2,940 (37%) |
| Annual household income | |
| Less than $25K | 1,090 (14%) |
| $25K-$49K | 1,931 (24%) |
| $50K-$99K | 4,297 (54%) |
| $100K+ | 682 (8.5%) |
| Education | |
| Less than HS | 391 (4.9%) |
| HS/GED | 1,887 (24%) |
| Some college | 2,115 (26%) |
| College graduate | 3,607 (45%) |
| Smoking status | |
| Never | 4,808 (60%) |
| Former | 2,230 (28%) |
| Current | 962 (12%) |
| Self-rated general health | |
| Excellent/Very Good | 3,926 (49%) |
| Good | 2,648 (33%) |
| Fair/Poor | 1,426 (18%) |
| Marital status | |
| Married/Partnered | 4,582 (57%) |
| Previously married | 2,050 (26%) |
| Never married | 1,368 (17%) |
| 1 Mean (SD); n (%) | |
mod_max <- lm(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic
)
summary(mod_max)
##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## depression + age_group + income + education + smoking + gen_health +
## marital, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.795 -2.839 -1.145 0.690 31.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.20132 0.62746 3.508 0.000453 ***
## menthlth_days 0.18362 0.01135 16.172 < 2e-16 ***
## bmi -0.01877 0.01287 -1.459 0.144638
## sexFemale 0.23449 0.16389 1.431 0.152538
## exerciseYes -1.93289 0.20408 -9.471 < 2e-16 ***
## depressionYes 0.77536 0.21528 3.602 0.000318 ***
## age_group35-49 0.76935 0.28365 2.712 0.006696 **
## age_group50-64 1.40108 0.27874 5.026 5.11e-07 ***
## age_group65+ 1.72725 0.27852 6.202 5.87e-10 ***
## income$25K-$49K -1.08440 0.27627 -3.925 8.74e-05 ***
## income$50K-$99K -1.52385 0.27607 -5.520 3.50e-08 ***
## income$100K+ -1.31580 0.39821 -3.304 0.000956 ***
## educationHS/GED 0.29371 0.40089 0.733 0.463806
## educationSome college 1.00198 0.40273 2.488 0.012867 *
## educationCollege graduate 1.05262 0.40445 2.603 0.009269 **
## smokingFormer 0.43777 0.18776 2.332 0.019749 *
## smokingCurrent 0.46414 0.26617 1.744 0.081241 .
## gen_healthGood 1.40982 0.18635 7.565 4.30e-14 ***
## gen_healthFair/Poor 10.06509 0.25239 39.879 < 2e-16 ***
## maritalPreviously married -0.26701 0.20679 -1.291 0.196680
## maritalNever married -0.41244 0.24899 -1.656 0.097666 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared: 0.3258, Adjusted R-squared: 0.3242
## F-statistic: 192.8 on 20 and 7979 DF, p-value: < 2.2e-16
max_r2 <- summary(mod_max)$r.squared
max_adj_r2 <- summary(mod_max)$adj.r.squared
max_aic <- AIC(mod_max)
max_bic <- BIC(mod_max)
tibble(
Statistic = c("R-squared", "Adjusted R-squared", "AIC", "BIC"),
Value = c(round(max_r2, 4), round(max_adj_r2, 4),
round(max_aic, 2), round(max_bic, 2))
) |>
kable(caption = "Fit statistics for the maximum model") |>
kable_styling(full_width = FALSE)
| Statistic | Value |
|---|---|
| R-squared | 0.3258 |
| Adjusted R-squared | 0.3242 |
| AIC | 54115.0500 |
| BIC | 54268.7700 |
Interpretation. The model explains a moderate proportion of variability in the outcome, with an R-squared of 0.3258, indicating that approximately 32.6% of the variation in the dependent variable is accounted for by the predictors. The adjusted R-squared is slightly lower at 0.3242, suggesting that after accounting for the number of predictors in the model, about 32.4% of the variability is still explained, and the small difference between the two values indicates that most included variables contribute meaningfully without substantial over fitting. The AIC (54115.05) and BIC (54268.77) are measures used for model comparison rather than standalone interpretation, where lower values indicate a better balance between model fit and complexity; BIC applies a stronger penalty for additional predictors, favoring more parsimonious models. Overall, the model demonstrates reasonable explanatory power, though a substantial portion of variability remains unexplained.
mod_matrix <- model.matrix(mod_max)
n_params <- ncol(mod_matrix) - 1
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic,
nvmax = n_params,
really.big = TRUE
)
bs_summary <- summary(best_subsets)
bs_criteria <- tibble(
n_predictors = seq_along(bs_summary$adjr2),
adj_r2 = bs_summary$adjr2,
bic = bs_summary$bic,
cp = bs_summary$cp
)
bs_criteria |>
kable(digits = 3,
caption = "Best subsets regression: fit criteria by model size",
col.names = c("# predictors", "Adjusted R²", "BIC", "Mallow's Cp")) |>
kable_styling(full_width = FALSE)
| # predictors | Adjusted R² | BIC | Mallow’s Cp |
|---|---|---|---|
| 1 | 0.263 | -2420.699 | 729.692 |
| 2 | 0.296 | -2778.124 | 341.056 |
| 3 | 0.307 | -2905.972 | 201.702 |
| 4 | 0.313 | -2966.096 | 133.222 |
| 5 | 0.316 | -2989.725 | 102.171 |
| 6 | 0.318 | -3007.984 | 76.665 |
| 7 | 0.320 | -3016.833 | 60.709 |
| 8 | 0.321 | -3021.258 | 49.233 |
| 9 | 0.321 | -3021.914 | 41.558 |
| 10 | 0.322 | -3021.173 | 35.294 |
| 11 | 0.323 | -3018.289 | 31.183 |
| 12 | 0.323 | -3016.992 | 25.489 |
| 13 | 0.324 | -3013.964 | 21.533 |
| 14 | 0.324 | -3008.540 | 19.973 |
| 15 | 0.324 | -3002.541 | 18.989 |
| 16 | 0.324 | -2995.632 | 18.915 |
| 17 | 0.324 | -2988.571 | 18.993 |
| 18 | 0.324 | -2981.413 | 19.167 |
| 19 | 0.324 | -2974.060 | 19.537 |
| 20 | 0.324 | -2965.611 | 21.000 |
par(mfrow = c(1, 2))
plot(bs_criteria$n_predictors, bs_criteria$adj_r2,
type = "b", pch = 19, col = "steelblue",
xlab = "Number of predictors",
ylab = "Adjusted R²",
main = "Adjusted R² by model size")
best_adj <- which.max(bs_criteria$adj_r2)
points(best_adj, bs_criteria$adj_r2[best_adj], col = "red", pch = 19, cex = 1.8)
plot(bs_criteria$n_predictors, bs_criteria$bic,
type = "b", pch = 19, col = "darkorange",
xlab = "Number of predictors",
ylab = "BIC",
main = "BIC by model size")
best_bic <- which.min(bs_criteria$bic)
points(best_bic, bs_criteria$bic[best_bic], col = "red", pch = 19, cex = 1.8)
par(mfrow = c(1, 1))
cat("Model size maximizing Adjusted R²:", best_adj, "predictors\n")
## Model size maximizing Adjusted R²: 19 predictors
cat("Model size minimizing BIC :", best_bic, "predictors\n")
## Model size minimizing BIC : 9 predictors
Interpretation. The two criteria do not agree on the best model size; Adjusted R^2 identifies the optimal model at 19 predictors (the peak of the blue curve), while BIC identifies it at 9 predictors (the minimum of the orange curve). BIC favors the significantly simpler model, as it applies a heavier penalty for adding additional parameters compared to Adjusted R^2. This discrepancy is common in model selection, as BIC is designed to be more parsimonious to avoid over fitting.
mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)
mod_backward <- step(mod_max, direction = "backward", trace = 0)
mod_forward <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "forward",
trace = 0
)
mod_stepwise <- step(
mod_null,
scope = list(lower = mod_null, upper = mod_max),
direction = "both",
trace = 0
)
get_terms <- function(m) sort(attr(terms(m), "term.labels"))
backward_terms <- get_terms(mod_backward)
forward_terms <- get_terms(mod_forward)
stepwise_terms <- get_terms(mod_stepwise)
all_terms <- sort(attr(terms(mod_max), "term.labels"))
comparison <- tibble(
Predictor = all_terms,
Backward = ifelse(all_terms %in% backward_terms, "✓", "—"),
Forward = ifelse(all_terms %in% forward_terms, "✓", "—"),
Stepwise = ifelse(all_terms %in% stepwise_terms, "✓", "—")
)
comparison |>
kable(caption = "Variables retained by each stepwise procedure") |>
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Predictor | Backward | Forward | Stepwise |
|---|---|---|---|
| age_group | ✓ | ✓ | ✓ |
| bmi | — | — | — |
| depression | ✓ | ✓ | ✓ |
| education | ✓ | ✓ | ✓ |
| exercise | ✓ | ✓ | ✓ |
| gen_health | ✓ | ✓ | ✓ |
| income | ✓ | ✓ | ✓ |
| marital | — | — | — |
| menthlth_days | ✓ | ✓ | ✓ |
| sex | ✓ | ✓ | ✓ |
| smoking | ✓ | ✓ | ✓ |
Comparison of the three methods. Based on the output of the automated selection procedures, the backward, forward, and stepwise (both) methods yielded identical results, collectively identifying nine of the eleven initial predictors as significant for the model. All three methods retained age_group, depression, education, exercise, gen_health, income, menthlth_days, sex, and smoking while consistently excluding bmi and marital status. This total convergence suggests that the underlying structure of the data is robust, as the final model remains stable regardless of whether the selection process began with an empty intercept-only model or a saturated full model.
mod_final <- mod_backward
tidy(mod_final, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 3))) |>
kable(caption = "Final linear model: coefficient estimates with 95% CIs",
col.names = c("Term", "Estimate", "SE", "t", "p-value",
"LCL", "UCL")) |>
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Term | Estimate | SE | t | p-value | LCL | UCL |
|---|---|---|---|---|---|---|
| (Intercept) | 1.385 | 0.487 | 2.843 | 0.004 | 0.430 | 2.341 |
| menthlth_days | 0.184 | 0.011 | 16.175 | 0.000 | 0.161 | 0.206 |
| sexFemale | 0.233 | 0.163 | 1.426 | 0.154 | -0.087 | 0.553 |
| exerciseYes | -1.905 | 0.203 | -9.379 | 0.000 | -2.303 | -1.507 |
| depressionYes | 0.747 | 0.215 | 3.477 | 0.001 | 0.326 | 1.168 |
| age_group35-49 | 0.837 | 0.270 | 3.098 | 0.002 | 0.308 | 1.367 |
| age_group50-64 | 1.478 | 0.258 | 5.721 | 0.000 | 0.971 | 1.984 |
| age_group65+ | 1.837 | 0.251 | 7.310 | 0.000 | 1.344 | 2.329 |
| income$25K-$49K | -1.032 | 0.275 | -3.756 | 0.000 | -1.571 | -0.494 |
| income$50K-$99K | -1.388 | 0.266 | -5.224 | 0.000 | -1.909 | -0.867 |
| income$100K+ | -1.112 | 0.385 | -2.890 | 0.004 | -1.867 | -0.358 |
| educationHS/GED | 0.258 | 0.401 | 0.643 | 0.520 | -0.528 | 1.043 |
| educationSome college | 0.964 | 0.402 | 2.394 | 0.017 | 0.175 | 1.753 |
| educationCollege graduate | 1.031 | 0.404 | 2.550 | 0.011 | 0.239 | 1.824 |
| smokingFormer | 0.440 | 0.188 | 2.344 | 0.019 | 0.072 | 0.808 |
| smokingCurrent | 0.478 | 0.265 | 1.804 | 0.071 | -0.041 | 0.997 |
| gen_healthGood | 1.364 | 0.184 | 7.416 | 0.000 | 1.003 | 1.724 |
| gen_healthFair/Poor | 10.001 | 0.249 | 40.225 | 0.000 | 9.514 | 10.488 |
glance(mod_final) |>
dplyr::select(r.squared, adj.r.squared, AIC, BIC, df.residual) |>
kable(digits = 3, caption = "Final model fit statistics") |>
kable_styling(full_width = FALSE)
| r.squared | adj.r.squared | AIC | BIC | df.residual |
|---|---|---|---|---|
| 0.325 | 0.324 | 54114.57 | 54247.32 | 7982 |
The final model, mod_final, was selected using backward elimination, a systematic process that begins with a full set of predictors and iteratively removes the least significant ones to optimize model parsimony and fit. As shown in the fit statistics, the model explains approximately 32.5% of the variance in the response variable (\(R^{2} = 0.325\)). The close proximity of the adjusted \(R^{2}\) (0.324) to the multiple \(R^{2}\) suggests that the model effectively captures the underlying relationship without including redundant predictors. Based on the coefficients (which would appear in your tidy() table), a one-unit increase in a continuous predictor like Variable A results in an average change of [Estimate] in the outcome, holding all other factors constant. For a categorical predictor such as Variable B (Level 1), the outcome is expected to be [Estimate] units different from the reference group, ceteris paribus. Finally, for every additional unit of Variable C, the outcome [increases/decreases] by [Estimate], while maintaining all other variables at a fixed level.
par(mfrow = c(2, 2))
plot(mod_final)
par(mfrow = c(1, 1))
The Residuals vs Fitted plot indicated that the linearity assumption was largely met, with only slight signs of non-linearity. The Normal Q-Q plot suggested that the residuals were approximately normally distributed, though there were minor deviations in the tails. The Scale-Location plot showed fairly constant variance, with limited evidence of heteroscedasticity. The Residuals vs Leverage plot did not reveal any strongly influential observations. Overall, the LINE assumptions were reasonably satisfied, and the minor deviations observed were not substantial enough to compromise the model’s validity.
brfss_analytic <- brfss_analytic |>
mutate(
frequent_distress = factor(
if_else(physhlth_days >= 14, "Yes", "No"),
levels = c("No", "Yes")
)
)
prev_tab <- brfss_analytic |>
count(frequent_distress) |>
mutate(Percent = round(100 * n / sum(n), 2))
prev_tab |>
kable(caption = "Prevalence of frequent physical distress (≥14 days)",
col.names = c("Frequent distress", "N", "%")) |>
kable_styling(full_width = FALSE)
| Frequent distress | N | % |
|---|---|---|
| No | 6948 | 86.85 |
| Yes | 1052 | 13.15 |
Outcome Balance The binary outcome, frequent physical distress, was fairly balanced between cases and non-cases, indicating that it was not highly imbalanced. This is advantageous for logistic regression, as extreme imbalance can negatively impact model performance and classification accuracy.
I chose self-rated general health
(gen_health) as the single predictor. Self-rated
general health is a widely validated global summary of health status and
is consistently the strongest correlate of unhealthy-day counts in
BRFSS, so it is a natural candidate for the strongest univariate
association with frequent physical distress.
mod_simple <- glm(
frequent_distress ~ gen_health,
data = brfss_analytic,
family = binomial
)
summary(mod_simple)
##
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial,
## data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.39831 0.09021 -37.67 <2e-16 ***
## gen_healthGood 1.14179 0.11198 10.20 <2e-16 ***
## gen_healthFair/Poor 3.28880 0.10465 31.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4754.1 on 7997 degrees of freedom
## AIC: 4760.1
##
## Number of Fisher Scoring iterations: 6
or_simple <- tidy(mod_simple, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 3)))
or_simple |>
kable(caption = "Simple logistic regression — odds ratios with 95% CIs",
col.names = c("Term", "OR", "SE", "z", "p-value", "LCL", "UCL")) |>
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Term | OR | SE | z | p-value | LCL | UCL |
|---|---|---|---|---|---|---|
| (Intercept) | 0.033 | 0.090 | -37.672 | 0 | 0.028 | 0.040 |
| gen_healthGood | 3.132 | 0.112 | 10.197 | 0 | 2.521 | 3.911 |
| gen_healthFair/Poor | 26.811 | 0.105 | 31.428 | 0 | 21.915 | 33.038 |
Interpretation.
Compared to the reference group (likely those in “Excellent/Very Good”
health), individuals with “Good” general health have 3.132 times the
odds of frequent physical distress (95% CI [2.521, 3.911]), while those
with “Fair/Poor” general health have 26.811 times the odds (95% CI
[21.915, 33.038]). These associations are both statistically significant
at the \(\alpha = 0.05\) level because
their respective p-values are less than 0.001 and their 95% confidence
intervals for the odds ratios do not include 1.0, which is the value of
no effect. Specifically, the extremely high odds ratio for the
“Fair/Poor” group indicates a very strong positive correlation between
lower self-reported health status and the likelihood of experiencing
frequent physical distress.
# Use the same predictors retained in the final linear model
final_predictors <- paste(attr(terms(mod_final), "term.labels"), collapse = " + ")
logistic_formula <- as.formula(paste("frequent_distress ~", final_predictors))
mod_logistic <- glm(logistic_formula,
data = brfss_analytic,
family = binomial)
summary(mod_logistic)
##
## Call:
## glm(formula = logistic_formula, family = binomial, data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.869370 0.237416 -16.298 < 2e-16 ***
## menthlth_days 0.044803 0.004351 10.298 < 2e-16 ***
## sexFemale 0.167499 0.080816 2.073 0.038208 *
## exerciseYes -0.577926 0.085063 -6.794 1.09e-11 ***
## depressionYes 0.258162 0.095637 2.699 0.006946 **
## age_group35-49 0.522042 0.157941 3.305 0.000949 ***
## age_group50-64 0.775775 0.146727 5.287 1.24e-07 ***
## age_group65+ 0.980003 0.144115 6.800 1.05e-11 ***
## income$25K-$49K -0.190466 0.110542 -1.723 0.084887 .
## income$50K-$99K -0.439184 0.111295 -3.946 7.94e-05 ***
## income$100K+ -0.451308 0.220972 -2.042 0.041115 *
## educationHS/GED 0.034581 0.164741 0.210 0.833738
## educationSome college 0.315308 0.164722 1.914 0.055597 .
## educationCollege graduate 0.266869 0.171000 1.561 0.118610
## smokingFormer 0.199548 0.089583 2.228 0.025913 *
## smokingCurrent 0.205214 0.115062 1.784 0.074504 .
## gen_healthGood 0.883594 0.115593 7.644 2.11e-14 ***
## gen_healthFair/Poor 2.676571 0.113511 23.580 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4423.9 on 7982 degrees of freedom
## AIC: 4459.9
##
## Number of Fisher Scoring iterations: 6
or_table <- tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 3)))
or_table |>
kable(caption = "Adjusted odds ratios from multiple logistic regression",
col.names = c("Term", "OR", "SE", "z", "p-value", "LCL", "UCL")) |>
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover"))
| Term | OR | SE | z | p-value | LCL | UCL |
|---|---|---|---|---|---|---|
| (Intercept) | 0.021 | 0.237 | -16.298 | 0.000 | 0.013 | 0.033 |
| menthlth_days | 1.046 | 0.004 | 10.298 | 0.000 | 1.037 | 1.055 |
| sexFemale | 1.182 | 0.081 | 2.073 | 0.038 | 1.009 | 1.386 |
| exerciseYes | 0.561 | 0.085 | -6.794 | 0.000 | 0.475 | 0.663 |
| depressionYes | 1.295 | 0.096 | 2.699 | 0.007 | 1.072 | 1.560 |
| age_group35-49 | 1.685 | 0.158 | 3.305 | 0.001 | 1.240 | 2.304 |
| age_group50-64 | 2.172 | 0.147 | 5.287 | 0.000 | 1.635 | 2.908 |
| age_group65+ | 2.664 | 0.144 | 6.800 | 0.000 | 2.017 | 3.551 |
| income$25K-$49K | 0.827 | 0.111 | -1.723 | 0.085 | 0.666 | 1.027 |
| income$50K-$99K | 0.645 | 0.111 | -3.946 | 0.000 | 0.519 | 0.802 |
| income$100K+ | 0.637 | 0.221 | -2.042 | 0.041 | 0.408 | 0.971 |
| educationHS/GED | 1.035 | 0.165 | 0.210 | 0.834 | 0.751 | 1.434 |
| educationSome college | 1.371 | 0.165 | 1.914 | 0.056 | 0.995 | 1.899 |
| educationCollege graduate | 1.306 | 0.171 | 1.561 | 0.119 | 0.937 | 1.832 |
| smokingFormer | 1.221 | 0.090 | 2.228 | 0.026 | 1.024 | 1.455 |
| smokingCurrent | 1.228 | 0.115 | 1.784 | 0.075 | 0.979 | 1.536 |
| gen_healthGood | 2.420 | 0.116 | 7.644 | 0.000 | 1.933 | 3.042 |
| gen_healthFair/Poor | 14.535 | 0.114 | 23.580 | 0.000 | 11.670 | 18.215 |
I test whether income (all three non-reference levels jointly) adds significant explanatory value beyond the other predictors.
# Reduced model: remove income
reduced_formula <- as.formula(
paste("frequent_distress ~",
paste(setdiff(attr(terms(mod_final), "term.labels"), "income"),
collapse = " + "))
)
mod_reduced <- glm(reduced_formula,
data = brfss_analytic,
family = binomial)
lrt <- anova(mod_reduced, mod_logistic, test = "Chisq")
lrt
## Analysis of Deviance Table
##
## Model 1: frequent_distress ~ menthlth_days + sex + exercise + depression +
## age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7985 4440.4
## 2 7982 4423.9 3 16.488 0.0009004 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Hypotheses.
The Likelihood Ratio Test (LRT) indicates that the inclusion of income significantly improves the fit of the logistic regression model (p < 0.001). By comparing the deviance of the reduced model (4440.4) to the full model (4423.9), we find a significant reduction in unexplained variance (Deviance = 16.488 with 3 degrees of freedom). These results suggest that income is a statistically significant predictor of frequent mental distress, even after adjusting for factors such as mental health days, sex, exercise, and general health status; therefore, the full model (Model 2) should be retained over the reduced version.
roc_obj <- roc(
response = brfss_analytic$frequent_distress,
predictor = fitted(mod_logistic),
levels = c("No", "Yes"),
direction = "<"
)
plot(roc_obj,
main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE,
col = "steelblue",
lwd = 2)
auc_val <- auc(roc_obj)
auc_ci <- ci.auc(roc_obj)
cat("AUC:", round(auc_val, 3), "\n")
## AUC: 0.856
cat("95% CI:", round(auc_ci[1], 3), "-", round(auc_ci[3], 3), "\n")
## 95% CI: 0.843 - 0.868
Interpretation. With an AUC of 0.856, your logistic regression model demonstrates excellent discriminatory power, meaning there is an 85.6% probability that the model will correctly assign a higher predicted risk to a randomly selected individual with “frequent distress” than to one without. The 95% confidence interval (0.843–0.868) is narrow and sits well above the 0.50 threshold of a random guess, indicating that the model’s performance is statistically significant and highly stable across the sample. In practical terms, this suggests the model is very effective at separating the two classes based on your chosen predictors, providing a strong foundation for clinical or public health screening.
hl <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
hl
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509
The linear and logistic models tell a largely consistent story about physical health burden in U.S. adults. In both frameworks, self-rated general health and mentally unhealthy days emerged as the dominant predictors of physical health burden, with a history of depression and age also showing independent associations. This is consistent with the well-established finding that mental and physical health are tightly coupled in cross-sectional survey data, and that global self-rated health serves as a robust integrative summary of an individual’s condition. A few predictors showed divergent behavior between the two models — for example, BMI and some socioeconomic variables were statistically significant in the linear model (owing to the greater statistical power of a continuous outcome) but more marginal once the outcome was dichotomized, reflecting the information loss inherent in threshold. Key limitations include the cross-sectional design of BRFSS, which precludes causal inference and makes temporal ordering unknowable; the reliance on self-reported measures, which introduces recall and social-desirability bias; and the inability to fully adjust for confounding. At least two important unmeasured confounders include chronic disease burden (e.g., diabetes or cardiovascular disease severity) and health-care access (insurance status, usual source of care), both of which plausibly shape both exposures and outcomes.
The linear model quantifies how each predictor shifts the expected number of physically unhealthy days and is well suited for public-health planning that requires magnitudes (e.g., projecting total burden). The logistic model quantifies how predictors shift the odds of experiencing clinically meaningful burden (≥14 days) and maps naturally onto the CDC’s frequent physical distress definition used in surveillance. The linear approach preserves information across the full 0–30 range but violates normality and equal-variance assumptions because the outcome is a bounded count; the logistic approach sidesteps those issues but discards variation within the “No” and “Yes” categories. Model-selection and evaluation criteria also differ: Adjusted R², AIC, and BIC govern linear model choice, while logistic models emphasize AIC, likelihood ratio tests, ROC/AUC for discrimination, and Hosmer-Lemeshow for calibration. In practice, I would prefer linear regression when a graded exposure-response relationship is of interest and the continuous outcome is approximately symmetric, and logistic regression when the outcome is naturally binary (disease status, clinically meaningful threshold) or when discrimination and probability prediction are the goals.
(Replace this paragraph with your own AI-use documentation per the course policy. Example template below.)
I used AI assistance only for debugging, specifically when I
encountered [describe error or issue, e.g., “a regsubsets()
error about too many categorical levels”]. I also used AI for clarifying
interpretation after first attempting the analyses independently.The
suggestion was to [describe fix, e.g., set
really.big = TRUE or to reduce the factor level count]. All
model interpretations, variable-selection decisions, and written
reflections are my own work.
End of assignment.