# Load necessary libraries
library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.5.2
library(kableExtra)
library(car)
library(broom)
library(haven)
library(dplyr)
library(janitor)
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(gt)
## Warning: package 'gt' was built under R version 4.5.2
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.5.2
library(ggplot2)
brfss_2023 <- readRDS("brfss_2023.rds")
brfss_2023 <- brfss_2023 %>%
clean_names()
The raw dataset, brfss_2023 has 433,323 observations and 350 variables.
brfss_clean <- brfss_2023 |>
mutate(
# Mental health days
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77, 99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Physical unhealth days
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth %in% c(77, 99) ~ NA_real_,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# BMI
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ as.numeric(bmi5) / 100
),
# Sex
sex = factor(case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")),
# Exercise
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No","Yes")),
# Age group
age_group = factor(case_when(
ageg5yr == 1 ~ "18-24",
ageg5yr == 2 ~ "25-29",
ageg5yr == 3 ~ "30-34",
ageg5yr == 4 ~ "35-39",
ageg5yr == 5 ~ "40-44",
ageg5yr == 6 ~ "45-49",
ageg5yr == 7 ~ "50-54",
ageg5yr == 8 ~ "55-59",
ageg5yr == 9 ~ "60-64",
ageg5yr == 10 ~ "65-69",
ageg5yr == 11 ~ "70-74",
ageg5yr == 12 ~ "75-79",
ageg5yr == 13 ~ "80+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels=c("18-24",
"25-29",
"30-34",
"35-39",
"40-44",
"45-49",
"50-54",
"55-59",
"60-64",
"65-69",
"70-74",
"75-79",
"80+")),
# Income
income = factor(case_when(
incomg1 == 1 ~ "Less than $15,000",
incomg1 == 2 ~ "$15,000-$24,999",
incomg1 == 3 ~ "$25,000-$34,999",
incomg1 == 4 ~ "$35,000-$49,999",
incomg1 == 5 ~ "$50,000-$99,999",
incomg1 == 6 ~ "$100,000-$199,999",
incomg1 == 7 ~ "$200,000 or more",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000",
"$15,000-$24,999",
"$25,000-$34,999",
"$35,000-$49,999",
"$50,000-$99,999",
"$100,000-$199,999",
"$200,000 or more")),
# Education
education = factor(case_when(
educa %in% c(1, 2) ~ "Less than high school",
educa == 3 ~ "High school diploma or GED",
educa == 4 ~ "Some college or technical school",
educa == 5 ~ "College graduate",
educa == 6 ~ "Graduate or professional degree",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school",
"High school diploma or GED",
"Some college or technical school",
"College graduate",
"Graduate or professional degree")),
# Smoking
smoking = factor(case_when(
smoker3 == 1 ~ "Current daily smoker",
smoker3 == 2 ~ "Current some-day smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_
),
levels = c("Current daily smoker",
"Current some-day smoker",
"Former smoker","Never smoker"))
) |>
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking)
cat("Total N:", nrow(brfss_clean), "\n")
## Total N: 433323
# Missing values for key variables
cat("Missing menthlth_days:", sum(is.na(brfss_clean$menthlth_days)),
"(", round(mean(is.na(brfss_clean$menthlth_days)) * 100, 1), "%)\n")
## Missing menthlth_days: 8108 ( 1.9 %)
cat("Missing physhlth_days:", sum(is.na(brfss_clean$physhlth_days)),
"(", round(mean(is.na(brfss_clean$physhlth_days)) * 100, 1), "%)\n")
## Missing physhlth_days: 10785 ( 2.5 %)
cat("Missing bmi:", sum(is.na(brfss_clean$bmi)),
"(", round(mean(is.na(brfss_clean$bmi)) * 100, 1), "%)\n")
## Missing bmi: 40535 ( 9.4 %)
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
saveRDS(brfss_analytic, "brfss_analytic.rds")
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
tbl_summary(
by = sex,
label = list(
menthlth_days ~ "Mentally Unhealthy Days (past 30)",
physhlth_days ~ "Physically Unhealthy Days (past 30)",
bmi ~ "BMI",
exercise ~ "Any Physical Activity (past 30 days)",
age_group ~ "Age Category (years)",
income ~ "Income Category",
education ~ "Education Level",
smoking ~ "Smoking Status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
bold_labels() |>
italicize_levels() |>
modify_caption("**Descriptive Statistics — BRFSS 2023, Stratified by Sex**") |>
as_gt()
| Characteristic | N | Male N = 3,9361 |
Female N = 4,0641 |
|---|---|---|---|
| Mentally Unhealthy Days (past 30) | 8,000 | 3.5 (7.5) | 5.1 (8.6) |
| Physically Unhealthy Days (past 30) | 8,000 | 4.0 (8.4) | 4.9 (8.9) |
| BMI | 8,000 | 28.7 (6.0) | 28.7 (7.0) |
| Any Physical Activity (past 30 days) | 8,000 | 3,146 (80%) | 3,094 (76%) |
| Age Category (years) | 8,000 | ||
| 18-24 | 235 (6.0%) | 171 (4.2%) | |
| 25-29 | 219 (5.6%) | 189 (4.7%) | |
| 30-34 | 253 (6.4%) | 210 (5.2%) | |
| 35-39 | 263 (6.7%) | 302 (7.4%) | |
| 40-44 | 290 (7.4%) | 292 (7.2%) | |
| 45-49 | 266 (6.8%) | 252 (6.2%) | |
| 50-54 | 305 (7.7%) | 303 (7.5%) | |
| 55-59 | 308 (7.8%) | 352 (8.7%) | |
| 60-64 | 408 (10%) | 379 (9.3%) | |
| 65-69 | 418 (11%) | 483 (12%) | |
| 70-74 | 382 (9.7%) | 426 (10%) | |
| 75-79 | 325 (8.3%) | 338 (8.3%) | |
| 80+ | 264 (6.7%) | 367 (9.0%) | |
| Income Category | 8,000 | ||
| Less than $15,000 | 160 (4.1%) | 247 (6.1%) | |
| $15,000-$24,999 | 271 (6.9%) | 370 (9.1%) | |
| $25,000-$34,999 | 376 (9.6%) | 495 (12%) | |
| $35,000-$49,999 | 482 (12%) | 585 (14%) | |
| $50,000-$99,999 | 1,251 (32%) | 1,260 (31%) | |
| $100,000-$199,999 | 996 (25%) | 869 (21%) | |
| $200,000 or more | 400 (10%) | 238 (5.9%) | |
| Education Level | 8,000 | ||
| Less than high school | 75 (1.9%) | 49 (1.2%) | |
| High school diploma or GED | 130 (3.3%) | 122 (3.0%) | |
| Some college or technical school | 950 (24%) | 877 (22%) | |
| College graduate | 1,018 (26%) | 1,120 (28%) | |
| Graduate or professional degree | 1,763 (45%) | 1,896 (47%) | |
| Smoking Status | 8,000 | ||
| Current daily smoker | 339 (8.6%) | 319 (7.8%) | |
| Current some-day smoker | 151 (3.8%) | 117 (2.9%) | |
| Former smoker | 1,207 (31%) | 1,055 (26%) | |
| Never smoker | 2,239 (57%) | 2,573 (63%) | |
| 1 Mean (SD); n (%) | |||
nrow(brfss_analytic)
## [1] 8000
The final analytic sample is 8000 observations with 9 variables after cleaning the dataset, recoding missing variables, and dropping any missing values.
Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?
# Full MLR Model: Unadjusted
m_full <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)
# Summary
summary(m_full)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000-$24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000-$34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000-$49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000-$99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000-$199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000-$24,999 ***
## income$25,000-$34,999 ***
## income$35,000-$49,999 ***
## income$50,000-$99,999 ***
## income$100,000-$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
tidy(m_full, conf.int = TRUE) %>%
mutate(
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 1. Multiple Linear Regression: Full Model (BRFSS 2023, n = 8,000)",
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
format = "html"
) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = TRUE) %>%
row_spec(0, bold = TRUE) %>%
row_spec(c(2, 3), background = "#EBF5FB")
| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 9.6505 | 0.9541 | 10.1151 | 0.0000 | 7.7803 | 11.5208 |
| physhlth_days | 0.2656 | 0.0101 | 26.3841 | 0.0000 | 0.2459 | 0.2853 |
| bmi | 0.0334 | 0.0132 | 2.5274 | 0.0115 | 0.0075 | 0.0593 |
| sexFemale | 1.3904 | 0.1675 | 8.3007 | 0.0000 | 1.0620 | 1.7187 |
| exerciseYes | -0.6512 | 0.2147 | -3.0326 | 0.0024 | -1.0721 | -0.2303 |
| age_group25-29 | -1.0566 | 0.5194 | -2.0343 | 0.0420 | -2.0747 | -0.0385 |
| age_group30-34 | -1.0939 | 0.5065 | -2.1600 | 0.0308 | -2.0867 | -0.1012 |
| age_group35-39 | -1.8110 | 0.4885 | -3.7072 | 0.0002 | -2.7686 | -0.8534 |
| age_group40-44 | -2.8931 | 0.4875 | -5.9346 | 0.0000 | -3.8487 | -1.9375 |
| age_group45-49 | -3.0562 | 0.4977 | -6.1408 | 0.0000 | -4.0318 | -2.0806 |
| age_group50-54 | -3.5167 | 0.4832 | -7.2775 | 0.0000 | -4.4640 | -2.5695 |
| age_group55-59 | -4.4960 | 0.4755 | -9.4543 | 0.0000 | -5.4282 | -3.5638 |
| age_group60-64 | -4.5221 | 0.4585 | -9.8633 | 0.0000 | -5.4209 | -3.6234 |
| age_group65-69 | -5.5779 | 0.4502 | -12.3903 | 0.0000 | -6.4604 | -4.6955 |
| age_group70-74 | -6.0254 | 0.4574 | -13.1728 | 0.0000 | -6.9220 | -5.1287 |
| age_group75-79 | -6.2866 | 0.4748 | -13.2392 | 0.0000 | -7.2174 | -5.3557 |
| age_group80+ | -6.8197 | 0.4768 | -14.3019 | 0.0000 | -7.7544 | -5.8850 |
| income$15,000-$24,999 | -1.6370 | 0.4690 | -3.4905 | 0.0005 | -2.5564 | -0.7177 |
| income$25,000-$34,999 | -2.0764 | 0.4480 | -4.6351 | 0.0000 | -2.9545 | -1.1982 |
| income$35,000-$49,999 | -2.5469 | 0.4382 | -5.8122 | 0.0000 | -3.4058 | -1.6879 |
| income$50,000-$99,999 | -3.0505 | 0.4107 | -7.4277 | 0.0000 | -3.8555 | -2.2454 |
| income$100,000-$199,999 | -3.4998 | 0.4292 | -8.1537 | 0.0000 | -4.3413 | -2.6584 |
| income$200,000 or more | -3.7841 | 0.5004 | -7.5628 | 0.0000 | -4.7649 | -2.8033 |
| educationHigh school diploma or GED | 0.0799 | 0.8107 | 0.0986 | 0.9215 | -1.5092 | 1.6690 |
| educationSome college or technical school | 1.0768 | 0.6897 | 1.5612 | 0.1185 | -0.2753 | 2.4288 |
| educationCollege graduate | 1.7909 | 0.6912 | 2.5911 | 0.0096 | 0.4360 | 3.1458 |
| educationGraduate or professional degree | 1.7378 | 0.6925 | 2.5095 | 0.0121 | 0.3803 | 3.0953 |
| smokingCurrent some-day smoker | -1.5867 | 0.5352 | -2.9645 | 0.0030 | -2.6359 | -0.5375 |
| smokingFormer smoker | -1.9897 | 0.3371 | -5.9020 | 0.0000 | -2.6506 | -1.3289 |
| smokingNever smoker | -2.9368 | 0.3216 | -9.1313 | 0.0000 | -3.5673 | -2.3063 |
menthlth_days = 9.6505 + (0.266 * physhlth_days) + (0.033 * bmi) + (1.390 * sexFemale) - (0.651 * exerciseYes) - (1.057 * age_group25-29) - (1.094 * age_group30-34) - (1.811 * age_group35-39) - (2.893 * age_group40-44) - (3.056 * age_group45-49) - (3.517 * age_group50-54) - (4.496 * age_group55-59) + - (4.522 * age_group60-64) - (5.578 * age_group65_69) - (6.025 * age_group70-74) - (6.287 * age_group75-79) - (6.820 * age_group80+) - (1.637 * income_15,000−24,999) - (2.076 * income_25,000-34,999) - (2.547 * income_35,000-49,999) - (3.051 * income_50,000-99,999) - (3.500 * income_100,000-199,999) - (3.784 * income$200,000 or more) + (0.080 * educationHigh school diploma or GED) + (1.077 * educationSome college or technical school) + (1.791 * educationCollege graduate) + (1.738 * educationGraduate or professional degree) - (1.587 * smokingCurrent some-day smoker) - (1.990 * smokingFormer smoker) - (2.937 * smokingNever smoker)
physhlth_days (continuous predictor): For each one unit increase in physically unhealthy day, the mentally unhealthy days increases by 0.266 units, holding all other predictors constant. This is the strongest continuous predictor and suggests that worse physical health is linked to worse mental health.
bmi (continuous predictor): For each one unit increase in BMI is associated with roughly 0.033 more mentally unhealthy days, holding all other predictors constant. Generally, this means that higher body weight is modestly linked to poorer mental health.
sex: Female vs. Male (reference): Females report about 1.39 more mentally unhealthy days than males. This indicates females experiences slightly more poor mental health days than males, holding all other factors constant.
exercise: Yes vs. No (reference): People who exercise report 0.651 fewer mentally unhealthy days than those who do not. This shows exercise is associated with modestly better mental health, holding all other factors constant.
Age group, 40-45: Adults aged 40-45 report 2.893 fewer mentally unhealthy days in the past 30 days compared to young adults (18–24). This shows that those who are middle-aged adults appear to have better mental health compared to younger adults, holding all other factors constant.
Income, $50,000-$99,999: People earning $50,000–$99,000 report about 3.051 fewer mentally unhealthy days than those earning less than $15,000. Higher income is linked to better mental health, holding all other factors constant.
Income, $200,00: Those earning $200,000 or more report 3.784 fewer mentally unhealthy days compared to the lowest income group. This means that very high income is associated with the fewest mentally unhealthy days, holding all other factors constant.
# Extract model-level statistics
mod_stats <- glance(m_full)
# Create a tidy table
tibble(
Statistic = c(
"R-squared",
"Adjusted R-squared",
"Root MSE (Residual Standard Error)",
"Overall F-statistic",
"Numerator df (p)",
"Denominator df (n - p - 1)",
"Overall p-value"
),
Value = c(
round(mod_stats$r.squared, 4),
round(mod_stats$adj.r.squared, 4),
round(mod_stats$sigma, 3),
round(mod_stats$statistic, 2),
mod_stats$df, # numerator df
mod_stats$df.residual, # denominator df
format.pval(mod_stats$p.value, digits = 3)
)
) |>
kable(caption = "Table 1.1. Model-Level Summary Statistics — BRFSS 2023") |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
| Statistic | Value |
|---|---|
| R-squared | 0.1853 |
| Adjusted R-squared | 0.1824 |
| Root MSE (Residual Standard Error) | 7.352 |
| Overall F-statistic | 62.52 |
| Numerator df (p) | 29 |
| Denominator df (n - p - 1) | 7970 |
| Overall p-value | <2e-16 |
R² = 0.1853: About 18.5% of the variation in mentally unhealthy days is explained by the predictors physhlth_days, bmi, sex, exercise, age_group, income, education, smoking in the model.
Adjusted R² = 0.1824: The adjusted R² is slightly lower than the unadjusted R², meaning that the number of predictors in the model may contain some variables that do not substantially improve its explanatory power. Because adjusted R² penalizes the addition of unnecessary predictors, this suggests that while the model explains about 18.24% of the variance in the outcome, some predictors may be contributing little and the model could potentially be simplified without losing much explanatory value.
Root MSE = 7.352: This indicates that the model’s predicted number of mentally unhealthy days typically differs from the actual reported value by about 7.35 days. In practical terms, this suggests that while the model captures some patterns in the data, there is still a considerable amount of prediction error for individual respondents. This level of error indicates moderate predictive accuracy and implies that other factors not included in the model may also influence mentally unhealthy days.
F-Test Results: H₀ (Null Hypothesis): All regression coefficients = 0. The F-statistic = 62.52 and overall p-value = <2e-16, we are going to reject H₀, indicating that the model is statisticallt significant and at least one predictor contributes significantly to explaining variation in mentally unhealthy days.
anova_3 <- Anova(m_full, type = "III")
anova_3 |>
tidy() |>
mutate(
across(where(is.numeric), \(x) round(x, 4)),
`Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
) |>
kable(
caption = "Table 2: Type III Partial Sum of Squares - Testing All Predictor's Contribution",
col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
column_spec(6, bold = TRUE)
| Predictor | Sum of Sq | df | F value | p-value | Significant? |
|---|---|---|---|---|---|
| (Intercept) | 5529.7722 | 1 | 102.3152 | 0.0000 | Yes |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 | Yes |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 | Yes |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 | Yes |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 | Yes |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 | Yes |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 | Yes |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 | Yes |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 | Yes |
| Residuals | 430750.0872 | 7970 | NA | NA | NA |
Based on the Type III (Partial) Sums of Squares, all of the predictors have statistically significant partial associations at α = 0.05. Physically unhealthy days (physhlth_days) is the strongest predictor — as it has the largest sum of squares and the largest F-statistic. This means that it explains the most variation in mentally unhealthy days after controlling for everything else.
# Reduced model (remove income)
m_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking, data = brfss_analytic)
# Compare models
anova_no_income <- anova(m_no_income, m_full) |>
as.data.frame() |>
rownames_to_column("Model") |>
mutate(
Model = c("Reduced (physhlth_days + bmi + sex + exercise + age_group + education + smoking)", "Full Model"),
across(where(is.numeric), ~ round(., 4))
) |>
kable(
caption = "Table 2.1. Chunk Test: Comparison Between Reduced Model Without Income and Full Model",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
anova_no_income
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (physhlth_days + bmi + sex + exercise + age_group + education + smoking) | 7976 | 435484.0 | NA | NA | NA | NA |
| Full Model | 7970 | 430750.1 | 6 | 4733.894 | 14.5982 | 0 |
Null Hypothesis (H₀): All income coefficients = 0, and income, as a group, is not associated with mentally unhealthy days and does not improve the model.
Alternative Hypothesis (H₁): At least one income coefficient ≠ 0, and income, as a group, is associated with mentally unhealthy days and improves the model.
Test Results: F-statistic: F(6, 7970) = 14.5982 Degrees of freedom: numerator = 6, denominator = 7970 p-value: 0
Conclusion: Since the p-value = 0 which is less than 0.05, we reject the null hypothesis. Income, as a group, significantly improves the model and is associated with mentally unhealthy days after adjusting for all other predictors.
# Reduced model (remove education)
m_no_edu <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking, data = brfss_analytic)
# Compare models
anova_no_edu <- anova(m_no_edu, m_full) |>
as.data.frame() |>
rownames_to_column("Model") |>
mutate(
Model = c("Reduced (physhlth_days + bmi + sex + exercise + age_group + income + smoking)", "Full Model"),
across(where(is.numeric), ~ round(., 4))
) |>
kable(
caption = "Table 2.2. Chunk Test: Comparison Between Reduced Model Without Education and Full Model",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
anova_no_edu
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (physhlth_days + bmi + sex + exercise + age_group + income + smoking) | 7974 | 432015.2 | NA | NA | NA | NA |
| Full Model | 7970 | 430750.1 | 4 | 1265.15 | 5.8521 | 1e-04 |
Null Hypothesis (H₀): All education coefficients = 0, and education, as a group, is not associated with mentally unhealthy days and does not improve the model,
Alternative Hypothesis (H₁): At least one education coefficient ≠ 0, and education, as a group, is associated with mentally unhealthy days and improves the model.
Test Results: F-statistic: F(4, 7970) = 5.8521 Degrees of freedom: numerator = 6, denominator = 7970 p-value: 0.0001
Conclusion: Since the p-value = 0 which is less than 0.05, we reject the null hypothesis. Education, as a group, significantly improves the model and is associated with mentally unhealthy days after adjusting for all other predictors.
The Type III results indicate that several predictors including physically unhealthy days, income, education, and smoking, make significant independent contributions to explaining the variation in mentally unhealthy days after adjusting for all other variables in the model. Among these, physically unhealthy days seems to be the strongest contributor, due to having the highest sum of squares and the highest F-statistic. The chunk tests further show that both income and education, as groups, significantly improve model fit. This highlights that the overall contribution of a categorical variable can be important even when individual levels are not. Compared to individual t-tests, chunk tests evaluate the joint effect of all categories within a variable, providing a more comprehensive assessment of its overall importance in the model.
brfss_analytic <- brfss_analytic %>%
mutate(
smoker_current = case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker"))
)
brfss_analytic %>%
select(smoker_current) %>%
tbl_summary(
label = list(smoker_current ~ "Current Smoking Status"),
statistic = all_categorical() ~ "{n} ({p}%)",
missing = "no"
) %>%
add_n() %>%
bold_labels() %>%
italicize_levels() %>%
modify_caption("Table 3: Current Smoking Status")
| Characteristic | N | N = 8,0001 |
|---|---|---|
| Current Smoking Status | 8,000 | |
| Non-smoker | 7,074 (88%) | |
| Current smoker | 926 (12%) | |
| 1 n (%) | ||
# Model A:
ModelA <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education, data = brfss_analytic)
tidy(ModelA, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.1. Model A — Main Effects Only",
col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.7553 | 0.9271 | 7.2869 | 0.0000 | 4.9381 | 8.5726 |
| physhlth_days | 0.2686 | 0.0101 | 26.6731 | 0.0000 | 0.2489 | 0.2884 |
| bmi | 0.0334 | 0.0132 | 2.5250 | 0.0116 | 0.0075 | 0.0593 |
| sexFemale | 1.3331 | 0.1673 | 7.9675 | 0.0000 | 1.0051 | 1.6611 |
| smoker_currentCurrent smoker | 2.1287 | 0.2712 | 7.8489 | 0.0000 | 1.5971 | 2.6604 |
| exerciseYes | -0.6725 | 0.2150 | -3.1274 | 0.0018 | -1.0940 | -0.2510 |
| age_group25-29 | -0.9149 | 0.5198 | -1.7602 | 0.0784 | -1.9338 | 0.1040 |
| age_group30-34 | -0.8823 | 0.5061 | -1.7434 | 0.0813 | -1.8743 | 0.1097 |
| age_group35-39 | -1.5810 | 0.4877 | -3.2421 | 0.0012 | -2.5369 | -0.6251 |
| age_group40-44 | -2.6157 | 0.4858 | -5.3847 | 0.0000 | -3.5680 | -1.6635 |
| age_group45-49 | -2.8246 | 0.4970 | -5.6836 | 0.0000 | -3.7988 | -1.8504 |
| age_group50-54 | -3.2600 | 0.4821 | -6.7628 | 0.0000 | -4.2050 | -2.3151 |
| age_group55-59 | -4.2301 | 0.4741 | -8.9219 | 0.0000 | -5.1595 | -3.3007 |
| age_group60-64 | -4.2484 | 0.4568 | -9.3000 | 0.0000 | -5.1439 | -3.3529 |
| age_group65-69 | -5.2338 | 0.4467 | -11.7163 | 0.0000 | -6.1095 | -4.3582 |
| age_group70-74 | -5.7023 | 0.4545 | -12.5457 | 0.0000 | -6.5933 | -4.8113 |
| age_group75-79 | -5.8977 | 0.4703 | -12.5401 | 0.0000 | -6.8197 | -4.9758 |
| age_group80+ | -6.4888 | 0.4737 | -13.6975 | 0.0000 | -7.4174 | -5.5602 |
| income$15,000-$24,999 | -1.6797 | 0.4698 | -3.5754 | 0.0004 | -2.6007 | -0.7588 |
| income$25,000-$34,999 | -2.1023 | 0.4486 | -4.6861 | 0.0000 | -2.9817 | -1.2229 |
| income$35,000-$49,999 | -2.5869 | 0.4390 | -5.8931 | 0.0000 | -3.4474 | -1.7264 |
| income$50,000-$99,999 | -3.0823 | 0.4114 | -7.4921 | 0.0000 | -3.8887 | -2.2758 |
| income$100,000-$199,999 | -3.5360 | 0.4300 | -8.2232 | 0.0000 | -4.3789 | -2.6931 |
| income$200,000 or more | -3.8625 | 0.5011 | -7.7078 | 0.0000 | -4.8448 | -2.8802 |
| educationHigh school diploma or GED | 0.2139 | 0.8119 | 0.2634 | 0.7922 | -1.3776 | 1.8053 |
| educationSome college or technical school | 1.1965 | 0.6907 | 1.7323 | 0.0833 | -0.1575 | 2.5505 |
| educationCollege graduate | 1.9035 | 0.6922 | 2.7499 | 0.0060 | 0.5466 | 3.2604 |
| educationGraduate or professional degree | 1.7456 | 0.6938 | 2.5160 | 0.0119 | 0.3856 | 3.1057 |
# Model B:
ModelB <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education, data = brfss_analytic)
tidy(ModelB, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.2. Model B, With Interaction (sex * smoker_current)",
col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.8994 | 0.9286 | 7.4301 | 0.0000 | 5.0791 | 8.7196 |
| physhlth_days | 0.2686 | 0.0101 | 26.6788 | 0.0000 | 0.2489 | 0.2883 |
| bmi | 0.0331 | 0.0132 | 2.5017 | 0.0124 | 0.0072 | 0.0590 |
| sexFemale | 1.1855 | 0.1775 | 6.6784 | 0.0000 | 0.8376 | 1.5335 |
| smoker_currentCurrent smoker | 1.5208 | 0.3654 | 4.1621 | 0.0000 | 0.8045 | 2.2371 |
| exerciseYes | -0.6728 | 0.2150 | -3.1301 | 0.0018 | -1.0942 | -0.2515 |
| age_group25-29 | -0.9202 | 0.5196 | -1.7709 | 0.0766 | -1.9388 | 0.0984 |
| age_group30-34 | -0.8924 | 0.5059 | -1.7640 | 0.0778 | -1.8841 | 0.0993 |
| age_group35-39 | -1.5929 | 0.4875 | -3.2673 | 0.0011 | -2.5485 | -0.6372 |
| age_group40-44 | -2.6286 | 0.4856 | -5.4127 | 0.0000 | -3.5806 | -1.6766 |
| age_group45-49 | -2.8425 | 0.4969 | -5.7209 | 0.0000 | -3.8165 | -1.8686 |
| age_group50-54 | -3.2778 | 0.4820 | -6.8012 | 0.0000 | -4.2226 | -2.3331 |
| age_group55-59 | -4.2499 | 0.4740 | -8.9652 | 0.0000 | -5.1791 | -3.3206 |
| age_group60-64 | -4.2640 | 0.4567 | -9.3364 | 0.0000 | -5.1593 | -3.3688 |
| age_group65-69 | -5.2506 | 0.4466 | -11.7563 | 0.0000 | -6.1261 | -4.3751 |
| age_group70-74 | -5.7111 | 0.4544 | -12.5686 | 0.0000 | -6.6018 | -4.8203 |
| age_group75-79 | -5.9076 | 0.4702 | -12.5646 | 0.0000 | -6.8292 | -4.9859 |
| age_group80+ | -6.4995 | 0.4736 | -13.7239 | 0.0000 | -7.4278 | -5.5711 |
| income$15,000-$24,999 | -1.6357 | 0.4700 | -3.4804 | 0.0005 | -2.5570 | -0.7144 |
| income$25,000-$34,999 | -2.0746 | 0.4486 | -4.6243 | 0.0000 | -2.9540 | -1.1952 |
| income$35,000-$49,999 | -2.5455 | 0.4392 | -5.7964 | 0.0000 | -3.4064 | -1.6847 |
| income$50,000-$99,999 | -3.0430 | 0.4116 | -7.3935 | 0.0000 | -3.8498 | -2.2362 |
| income$100,000-$199,999 | -3.5097 | 0.4300 | -8.1623 | 0.0000 | -4.3526 | -2.6668 |
| income$200,000 or more | -3.8405 | 0.5010 | -7.6651 | 0.0000 | -4.8226 | -2.8583 |
| educationHigh school diploma or GED | 0.1256 | 0.8124 | 0.1546 | 0.8772 | -1.4669 | 1.7180 |
| educationSome college or technical school | 1.1179 | 0.6912 | 1.6172 | 0.1059 | -0.2371 | 2.4729 |
| educationCollege graduate | 1.8179 | 0.6928 | 2.6239 | 0.0087 | 0.4598 | 3.1760 |
| educationGraduate or professional degree | 1.6691 | 0.6943 | 2.4040 | 0.0162 | 0.3081 | 3.0300 |
| sexFemale:smoker_currentCurrent smoker | 1.2833 | 0.5171 | 2.4819 | 0.0131 | 0.2697 | 2.2968 |
anova_interaction <- anova(ModelA, ModelB)
anova_interaction
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7972 432509
## 2 7971 432175 1 333.97 6.1598 0.01309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_interaction |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.3. Comparison Between Model A (no interaction) vs. Model B (with interaction)",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education | 7972 | 432508.9 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
Null Hypothesis (H₀): The interaction between sex and current smoking does not improve the model; β for the interaction = 0.
Alternative Hypothesis (H₁): The interaction between sex and current smoking significantly improves the model; β ≠ 0.
Conclusion at α = 0.05: F(1, 7971) = 6.16, p = 0.013 From these results and the p-value < 0.05, we are going to reject the null hypothesis. The interaction between sex and current smoking status is statistically significant; this means the effect of smoking on mentally unhealthy days differs by sex.
# Get coefficients from Model B
Coef_B <- coef(ModelB)
# Effect of current smoking among men (reference group)
smoking_men <- Coef_B["smoker_currentCurrent smoker"]
# Effect of current smoking among women (add interaction term)
smoking_women <- smoking_men + Coef_B["sexFemale:smoker_currentCurrent smoker"]
# Difference between women and men
diff_sex <- smoking_women - smoking_men
# Print results
cat("Estimated association of current smoking with mentally unhealthy days:\n")
## Estimated association of current smoking with mentally unhealthy days:
cat(" Among men :", round(smoking_men, 3), "days\n")
## Among men : 1.521 days
cat(" Among women :", round(smoking_women, 3), "days\n")
## Among women : 2.804 days
cat(" Difference :", round(diff_sex, 3), "days\n")
## Difference : 1.283 days
Interpretation: Current smoking is associated with 1.52 more mentally unhealthy days per month for men (reference group). Among women, smoking is associated with 2.80 more days, which is about 1.28 days higher than for men, indicating a stronger association in women.
pred_int <- ggpredict(ModelB, terms = c("smoker_current", "sex"))
ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group, group = group)) +
geom_point(size = 4, position = position_dodge(width = 0.3)) +
geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, position = position_dodge(width = 0.3)) +
labs(
title = "Predicted Mentally Unhealthy Days by Sex and Current Smoking Status",
subtitle = "Model B Interaction: sex * smoker_current",
x = "Smoking Status",
y = "Predicted Mentally Unhealthy Days (past 30 days)",
color = "Sex",
fill = "Sex"
) +
theme_minimal(base_size = 13) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")
The relationship between smoking and mental health differs for men and women. Current smokers tend to report more mentally unhealthy days than non-smokers, but this effect is stronger among women. This means that women who smoke may experience a greater burden of poor mental health compared to men who smoke. Public health interventions targeting smoking cessation could have additional mental health benefits, especially for women. Addressing smoking among women may therefore help reduce disparities in mental well-being.
The results of this analysis suggest both health-related factors and socioeconomic factors play an important role in shaping mental health among US adults. Physically unhealthy days is the strongest predictor of mentally unhealthy days, and explains the contribution of variation in mentally unhealthy days. This highlights the close connection between physical and mental well-being. Age groups also contributes significantly, followed by smoking status and income, indicating that both behavioral and social factors are important. While all predictors are statistically significant in the Type III analysis, some variables such as BMI and exercise had relatively smaller contributions compared to others.
A key limitation of cross-sectional survey data is that exposure and outcome are measured at the same time, making it impossible to determine the direction of relationships or establish causality. For example, it is unclear whether poor physical health leads to worse mental health or vice versa. Additionally, cross-sectional data are vulnerable to recall bias and self-reporting errors, which may affect the accuracy of variables, such as mental health days or health behaviors. Unmeasured confounding is also a concern. For example, access to healthcare could influence both health behaviors (like exercise or smoking) and mental health outcomes, and pre-existing mental health conditions could affect both current health status and reported mentally unhealthy days.
Adjusted R-squared provides a more accurate measure of model performance than regular R-squared because it accounts for the number of predictors included in the model. While R-squared will always increase as more variables are added, adjusted R-squared checks whether the new variables actually improve the model enough to justify making it more complicated. This helps ensure that improvements in model fit reflect meaningful contributions rather than over-fitting. Chunk tests further strengthen interpretation by evaluating the overall contribution of grouped variables such as income and education. Rather than relying solely on individual t-tests for each category, chunk tests assess whether the variable as a whole significantly improves the model. This is particularly useful when some individual categories may not appear statistically significant.
I used AI assistance to help visualize the interaction between sex and smoking status using ggeffects::ggpredict(). When I first created the plot, the lines were not appearing correctly due to issues with grouping and positioning in the code. AI helped identify and fix these issues by adjusting the aesthetics and ensuring the correct grouping structure. I verified the final output by confirming that the plot matched the expected interaction pattern based on my model results.