library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)
brfss_raw <- read_xpt("/Users/karoekor/Downloads/data files/LLCP2023.XPT ")
brfss_sel <- brfss_raw |>
select(
MENTHLTH,
PHYSHLTH,
`_BMI5`,
SEXVAR,
EXERANY2,
`_AGEG5YR`,
`_INCOMG1`,
EDUCA,
`_SMOKER3`
)
brfss_clean <- brfss_sel |>
mutate(
# Outcome and continuous
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
),
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
),
# Binary
sex = factor(
case_when(SEXVAR == 1 ~ "Male", SEXVAR == 2 ~ "Female", TRUE ~ NA_character_),
levels = c("Male", "Female")
),
exercise = factor(
case_when(EXERANY2 == 1 ~ "Yes", EXERANY2 == 2 ~ "No",
EXERANY2 %in% c(7, 9) ~ NA_character_, TRUE ~ NA_character_),
levels = c("No", "Yes")
),
# Age group (13 levels)
age_group = factor(
case_when(
`_AGEG5YR` == 1 ~ "18-24",
`_AGEG5YR` == 2 ~ "25-29",
`_AGEG5YR` == 3 ~ "30-34",
`_AGEG5YR` == 4 ~ "35-39",
`_AGEG5YR` == 5 ~ "40-44",
`_AGEG5YR` == 6 ~ "45-49",
`_AGEG5YR` == 7 ~ "50-54",
`_AGEG5YR` == 8 ~ "55-59",
`_AGEG5YR` == 9 ~ "60-64",
`_AGEG5YR` == 10 ~ "65-69",
`_AGEG5YR` == 11 ~ "70-74",
`_AGEG5YR` == 12 ~ "75-79",
`_AGEG5YR` == 13 ~ "80+",
`_AGEG5YR` == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-24","25-29","30-34","35-39","40-44","45-49",
"50-54","55-59","60-64","65-69","70-74","75-79","80+")
),
# Income (7 levels)
income = factor(
case_when(
`_INCOMG1` == 1 ~ "Less than $15,000",
`_INCOMG1` == 2 ~ "$15,000–$24,999",
`_INCOMG1` == 3 ~ "$25,000–$34,999",
`_INCOMG1` == 4 ~ "$35,000–$49,999",
`_INCOMG1` == 5 ~ "$50,000–$99,999",
`_INCOMG1` == 6 ~ "$100,000–$199,999",
`_INCOMG1` == 7 ~ "$200,000 or more",
`_INCOMG1` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000","$15,000–$24,999","$25,000–$34,999",
"$35,000–$49,999","$50,000–$99,999","$100,000–$199,999","$200,000 or more")
),
# Education (5 levels)
education = factor(
case_when(
EDUCA %in% c(1, 2) ~ "Less than high school",
EDUCA == 3 ~ "High school diploma or GED",
EDUCA == 4 ~ "Some college or technical school",
EDUCA == 5 ~ "College graduate",
EDUCA == 6 ~ "Graduate or professional degree",
EDUCA == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school","High school diploma or GED",
"Some college or technical school","College graduate",
"Graduate or professional degree")
),
# Smoking (4 levels)
smoking = factor(
case_when(
`_SMOKER3` == 1 ~ "Current daily smoker",
`_SMOKER3` == 2 ~ "Current some-day smoker",
`_SMOKER3` == 3 ~ "Former smoker",
`_SMOKER3` == 4 ~ "Never smoker",
`_SMOKER3` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Never smoker","Former smoker","Current some-day smoker","Current daily smoker")
)
)
miss_report <- brfss_clean |>
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
summarise(across(everything(), list(
n_missing = ~sum(is.na(.)),
pct_missing = ~round(mean(is.na(.)) * 100, 1)
))) |>
pivot_longer(everything(), names_to = c("variable", ".value"), names_sep = "_(?=[^_]+$)")
miss_report |>
kable(caption = "Missing values before exclusions") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| variable | missing |
|---|---|
| menthlth_days_n | 8108.0 |
| menthlth_days_pct | 1.9 |
| physhlth_days_n | 10785.0 |
| physhlth_days_pct | 2.5 |
| bmi_n | 40535.0 |
| bmi_pct | 9.4 |
| sex_n | 0.0 |
| sex_pct | 0.0 |
| exercise_n | 1251.0 |
| exercise_pct | 0.3 |
| age_group_n | 7779.0 |
| age_group_pct | 1.8 |
| income_n | 86623.0 |
| income_pct | 20.0 |
| education_n | 2325.0 |
| education_pct | 0.5 |
| smoking_n | 23062.0 |
| smoking_pct | 5.3 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
cat("Final analytic n:", nrow(brfss_analytic), "\n")
## Final analytic n: 8000
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
tbl_summary(
by = sex,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
label = list(
menthlth_days ~ "Mentally unhealthy days",
physhlth_days ~ "Physically unhealthy days",
bmi ~ "BMI",
exercise ~ "Any exercise (past 30 days)",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education level",
smoking ~ "Smoking status"
)
) |>
add_overall() |>
bold_labels()
| Characteristic | Overall N = 8,0001 |
Male N = 3,9361 |
Female N = 4,0641 |
|---|---|---|---|
| Mentally unhealthy days | 4 (8) | 4 (8) | 5 (9) |
| Physically unhealthy days | 4 (9) | 4 (8) | 5 (9) |
| BMI | 29 (7) | 29 (6) | 29 (7) |
| Any exercise (past 30 days) | 6,240 (78%) | 3,146 (80%) | 3,094 (76%) |
| Age group | |||
| 18-24 | 406 (5.1%) | 235 (6.0%) | 171 (4.2%) |
| 25-29 | 408 (5.1%) | 219 (5.6%) | 189 (4.7%) |
| 30-34 | 463 (5.8%) | 253 (6.4%) | 210 (5.2%) |
| 35-39 | 565 (7.1%) | 263 (6.7%) | 302 (7.4%) |
| 40-44 | 582 (7.3%) | 290 (7.4%) | 292 (7.2%) |
| 45-49 | 518 (6.5%) | 266 (6.8%) | 252 (6.2%) |
| 50-54 | 608 (7.6%) | 305 (7.7%) | 303 (7.5%) |
| 55-59 | 660 (8.3%) | 308 (7.8%) | 352 (8.7%) |
| 60-64 | 787 (9.8%) | 408 (10%) | 379 (9.3%) |
| 65-69 | 901 (11%) | 418 (11%) | 483 (12%) |
| 70-74 | 808 (10%) | 382 (9.7%) | 426 (10%) |
| 75-79 | 663 (8.3%) | 325 (8.3%) | 338 (8.3%) |
| 80+ | 631 (7.9%) | 264 (6.7%) | 367 (9.0%) |
| Annual household income | |||
| Less than $15,000 | 407 (5.1%) | 160 (4.1%) | 247 (6.1%) |
| $15,000–$24,999 | 641 (8.0%) | 271 (6.9%) | 370 (9.1%) |
| $25,000–$34,999 | 871 (11%) | 376 (9.6%) | 495 (12%) |
| $35,000–$49,999 | 1,067 (13%) | 482 (12%) | 585 (14%) |
| $50,000–$99,999 | 2,511 (31%) | 1,251 (32%) | 1,260 (31%) |
| $100,000–$199,999 | 1,865 (23%) | 996 (25%) | 869 (21%) |
| $200,000 or more | 638 (8.0%) | 400 (10%) | 238 (5.9%) |
| Education level | |||
| Less than high school | 124 (1.6%) | 75 (1.9%) | 49 (1.2%) |
| High school diploma or GED | 252 (3.2%) | 130 (3.3%) | 122 (3.0%) |
| Some college or technical school | 1,827 (23%) | 950 (24%) | 877 (22%) |
| College graduate | 2,138 (27%) | 1,018 (26%) | 1,120 (28%) |
| Graduate or professional degree | 3,659 (46%) | 1,763 (45%) | 1,896 (47%) |
| Smoking status | |||
| Never smoker | 4,812 (60%) | 2,239 (57%) | 2,573 (63%) |
| Former smoker | 2,262 (28%) | 1,207 (31%) | 1,055 (26%) |
| Current some-day smoker | 268 (3.4%) | 151 (3.8%) | 117 (2.9%) |
| Current daily smoker | 658 (8.2%) | 339 (8.6%) | 319 (7.8%) |
| 1 Mean (SD); n (%) | |||
model_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
summary(model_full)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.71372 0.92549 7.254 4.42e-13
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000–$24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000–$34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000–$49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000–$99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000–$199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingFormer smoker 0.94710 0.19329 4.900 9.77e-07
## smokingCurrent some-day smoker 1.35011 0.46820 2.884 0.003942
## smokingCurrent daily smoker 2.93681 0.32162 9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000–$24,999 ***
## income$25,000–$34,999 ***
## income$35,000–$49,999 ***
## income$50,000–$99,999 ***
## income$100,000–$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingFormer smoker ***
## smokingCurrent some-day smoker **
## smokingCurrent daily smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
round(coef(model_full), 3)
## (Intercept)
## 6.714
## physhlth_days
## 0.266
## bmi
## 0.033
## sexFemale
## 1.390
## exerciseYes
## -0.651
## age_group25-29
## -1.057
## age_group30-34
## -1.094
## age_group35-39
## -1.811
## age_group40-44
## -2.893
## age_group45-49
## -3.056
## age_group50-54
## -3.517
## age_group55-59
## -4.496
## age_group60-64
## -4.522
## age_group65-69
## -5.578
## age_group70-74
## -6.025
## age_group75-79
## -6.287
## age_group80+
## -6.820
## income$15,000–$24,999
## -1.637
## income$25,000–$34,999
## -2.076
## income$35,000–$49,999
## -2.547
## income$50,000–$99,999
## -3.050
## income$100,000–$199,999
## -3.500
## income$200,000 or more
## -3.784
## educationHigh school diploma or GED
## 0.080
## educationSome college or technical school
## 1.077
## educationCollege graduate
## 1.791
## educationGraduate or professional degree
## 1.738
## smokingFormer smoker
## 0.947
## smokingCurrent some-day smoker
## 1.350
## smokingCurrent daily smoker
## 2.937
glance(model_full) |>
select(r.squared, adj.r.squared, sigma, statistic, df, df.residual, p.value) |>
kable(digits = 4, caption = "Model-level fit statistics") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| r.squared | adj.r.squared | sigma | statistic | df | df.residual | p.value |
|---|---|---|---|---|---|---|
| 0.1853 | 0.1824 | 7.3516 | 62.5234 | 29 | 7970 | 0 |
\[\hat{Y} = 6.714 + 0.266(\text{physhlth\_days}) + 0.033(\text{bmi}) + 1.390(\text{sexFemale}) - 0.651(\text{exerciseYes}) - 1.057(\text{age}_{25-29}) - 1.094(\text{age}_{30-34}) - 1.811(\text{age}_{35-39}) - 2.893(\text{age}_{40-44}) - 3.056(\text{age}_{45-49}) - 3.517(\text{age}_{50-54}) - 4.496(\text{age}_{55-59}) - 4.522(\text{age}_{60-64}) - 5.578(\text{age}_{65-69}) - 6.025(\text{age}_{70-74}) - 6.287(\text{age}_{75-79}) - 6.820(\text{age}_{80+}) - 1.637(\text{income}_{\$15-24k}) - 2.076(\text{income}_{\$25-34k}) - 2.547(\text{income}_{\$35-49k}) - 3.050(\text{income}_{\$50-99k}) - 3.500(\text{income}_{\$100-199k}) - 3.784(\text{income}_{\$200k+}) + 0.080(\text{educ\_HS}) + 1.077(\text{educ\_SomeCollege}) + 1.791(\text{educ\_CollegeGrad}) + 1.738(\text{educ\_GradDeg}) + 0.947(\text{smoking\_Former}) + 1.350(\text{smoking\_SomeDay}) + 2.937(\text{smoking\_Daily})\]
physhlth_days (β = 0.266): For each additional physically unhealthy day in the past 30, the number of mentally unhealthy days is estimated to be 0.266 days higher, holding all other variables constant.
bmi (β = 0.033): For each one-unit increase in BMI, the number of mentally unhealthy days is estimated to be 0.033 days higher, holding all other variables constant.
sexFemale (β = 1.390): Females are estimated to report 1.390 more mentally unhealthy days in the past 30 compared to males (the reference category), holding all other variables constant.
exerciseYes (β = −0.651): Those who exercised in the past 30 days are estimated to report 0.651 fewer mentally unhealthy days compared to those who did not exercise (the reference category), holding all other variables constant.
age_group60–64 (β = −4.496): Adults aged 60–64 are estimated to report 4.496 fewer mentally unhealthy days compared to those aged 18–24 (the reference category), holding all other variables constant.
income$50,000–$99,999 (β = −3.050): Adults with annual household incomes of $50,000–$99,999 are estimated to report 3.050 fewer mentally unhealthy days compared to those earning less than $15,000 (the reference category), holding all other variables constant.
income$200,000 or more (β = −3.784): Adults with annual household incomes of $200,000 or more are estimated to report 3.784 fewer mentally unhealthy days compared to those earning less than $15,000 (the reference category), holding all other variables constant.
R-squared = 0.1853: The model explains 18.53% of the variance in mentally unhealthy days across the 8,000 respondents.
Adjusted R-squared = 0.1824: After penalizing for the 29 predictors in the model, 18.24% of variance is explained. The small difference from R² (0.0029) indicates the predictors collectively contribute meaningful explanatory power and the model is not substantially overfitted.
Root MSE (Residual Standard Error) = 7.352: On average, the model’s predicted values of mentally unhealthy days deviate from the observed values by approximately 7.35 days.
Overall F-test: The null hypothesis (H₀) states that all regression coefficients are simultaneously equal to zero — that is, none of the predictors is associated with mentally unhealthy days. The F-statistic is 62.52 on 29 (numerator) and 7970 (denominator) degrees of freedom, with p < 2.2e-16. We reject H₀ and conclude that the model as a whole explains a statistically significant amount of variance in mentally unhealthy days (p < 0.001).
# Type III partial F-tests for all predictors
car::Anova(model_full, type = "III")
## Anova Table (Type III tests)
##
## Response: menthlth_days
## Sum Sq Df F value Pr(>F)
## (Intercept) 2844 1 52.6240 4.418e-13 ***
## physhlth_days 37623 1 696.1198 < 2.2e-16 ***
## bmi 345 1 6.3879 0.0115095 *
## sex 3724 1 68.9012 < 2.2e-16 ***
## exercise 497 1 9.1966 0.0024325 **
## age_group 30092 12 46.3986 < 2.2e-16 ***
## income 4734 6 14.5982 < 2.2e-16 ***
## education 1265 4 5.8521 0.0001064 ***
## smoking 5101 3 31.4613 < 2.2e-16 ***
## Residuals 430750 7970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reduced model: all predictors EXCEPT income
model_no_income <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
# Chunk test: does income improve model fit?
anova(model_no_income, model_full)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7976 435484
## 2 7970 430750 6 4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Reduced model: all predictors EXCEPT education
model_no_educ <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
# Chunk test: does education improve model fit?
anova(model_no_educ, model_full)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7974 432015
## 2 7970 430750 4 1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Type III ANOVA table shows partial F-tests for each predictor after accounting for all others. At α = 0.05, all predictors are statistically significant: physhlth_days (F = 696.12, p < 2.2e-16), bmi (F = 6.39, p = 0.0115), sex (F = 68.90, p < 2.2e-16), exercise (F = 9.20, p = 0.0024), age_group (F = 46.40, p < 2.2e-16), income (F = 14.60, p < 2.2e-16), education (F = 5.85, p = 0.0001), and smoking (F = 31.46, p < 2.2e-16).
H₀: The six income dummy variable coefficients are
simultaneously equal to zero — income as a group does not improve model
fit beyond the other predictors.
Hₐ: At least one income coefficient differs from zero —
income collectively improves model fit.
F = 14.598, df = 6, p < 2.2e-16. We reject H₀ and conclude that
income, as a group of predictors, is statistically significantly
associated with mentally unhealthy days after adjusting for all other
variables in the model (p < 0.001).
H₀: The four education dummy variable coefficients
are simultaneously equal to zero — education as a group does not improve
model fit beyond the other predictors.
Hₐ: At least one education coefficient differs from
zero.
F = 5.8521, df = 4, p = 0.0001064. We reject H₀ and conclude that
education level, as a group of predictors, is statistically
significantly associated with mentally unhealthy days after adjusting
for all other variables in the model (p < 0.001).
The Type III results indicate that every predictor in the model has a
statistically significant independent association with mentally
unhealthy days after adjusting for all other variables. Physical health
burden (physhlth_days), sex, age group, and smoking status show the
strongest partial associations, as reflected by their large F-values,
while bmi and education make smaller but still significant
contributions. The chunk tests for income and education add an important
layer of inference beyond the individual t-tests from
summary(): rather than evaluating each dummy variable level
in isolation, they test whether a multi-level categorical variable as a
whole improves model fit. This is especially valuable because a given
dummy level may appear non-significant individually while the variable
collectively contributes meaningful explanatory power — as confirmed
here, where both income and education pass the chunk test decisively
despite some non-significant individual levels.
brfss_analytic <- brfss_analytic |>
mutate(
smoker_current = factor(
case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
levels = c("Non-smoker", "Current smoker") # Non-smoker is reference
)
)
table(brfss_analytic$smoker_current, useNA = "ifany")
##
## Non-smoker Current smoker
## 7074 926
# Model A: main effects only
model_A <- lm(
menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_A)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2913 -3.8732 -1.6219 0.6681 30.4937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.75533 0.92706 7.287 3.48e-13
## physhlth_days 0.26862 0.01007 26.673 < 2e-16
## bmi 0.03341 0.01323 2.525 0.011589
## sexFemale 1.33309 0.16732 7.967 1.84e-15
## smoker_currentCurrent smoker 2.12874 0.27121 7.849 4.74e-15
## exerciseYes -0.67248 0.21503 -3.127 0.001770
## age_group25-29 -0.91492 0.51978 -1.760 0.078412
## age_group30-34 -0.88226 0.50606 -1.743 0.081301
## age_group35-39 -1.58102 0.48765 -3.242 0.001191
## age_group40-44 -2.61572 0.48577 -5.385 7.46e-08
## age_group45-49 -2.82462 0.49698 -5.684 1.37e-08
## age_group50-54 -3.26004 0.48206 -6.763 1.45e-11
## age_group55-59 -4.23010 0.47413 -8.922 < 2e-16
## age_group60-64 -4.24840 0.45682 -9.300 < 2e-16
## age_group65-69 -5.23383 0.44671 -11.716 < 2e-16
## age_group70-74 -5.70233 0.45453 -12.546 < 2e-16
## age_group75-79 -5.89774 0.47031 -12.540 < 2e-16
## age_group80+ -6.48879 0.47372 -13.697 < 2e-16
## income$15,000–$24,999 -1.67974 0.46981 -3.575 0.000352
## income$25,000–$34,999 -2.10230 0.44863 -4.686 2.83e-06
## income$35,000–$49,999 -2.58691 0.43898 -5.893 3.95e-09
## income$50,000–$99,999 -3.08227 0.41140 -7.492 7.50e-14
## income$100,000–$199,999 -3.53598 0.43000 -8.223 2.30e-16
## income$200,000 or more -3.86251 0.50112 -7.708 1.43e-14
## educationHigh school diploma or GED 0.21386 0.81186 0.263 0.792237
## educationSome college or technical school 1.19653 0.69073 1.732 0.083264
## educationCollege graduate 1.90349 0.69219 2.750 0.005974
## educationGraduate or professional degree 1.74564 0.69382 2.516 0.011890
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseYes **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000–$24,999 ***
## income$25,000–$34,999 ***
## income$35,000–$49,999 ***
## income$50,000–$99,999 ***
## income$100,000–$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school .
## educationCollege graduate **
## educationGraduate or professional degree *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.366 on 7972 degrees of freedom
## Multiple R-squared: 0.182, Adjusted R-squared: 0.1792
## F-statistic: 65.7 on 27 and 7972 DF, p-value: < 2.2e-16
# Model B: with sex * smoker_current interaction
model_B <- lm(
menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_B)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.337 -3.837 -1.604 0.628 30.426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.89938 0.92857 7.430 1.20e-13
## physhlth_days 0.26859 0.01007 26.679 < 2e-16
## bmi 0.03309 0.01323 2.502 0.012380
## sexFemale 1.18553 0.17752 6.678 2.58e-11
## smoker_currentCurrent smoker 1.52079 0.36539 4.162 3.19e-05
## exerciseYes -0.67285 0.21496 -3.130 0.001753
## age_group25-29 -0.92018 0.51962 -1.771 0.076616
## age_group30-34 -0.89242 0.50591 -1.764 0.077774
## age_group35-39 -1.59287 0.48752 -3.267 0.001090
## age_group40-44 -2.62864 0.48564 -5.413 6.39e-08
## age_group45-49 -2.84255 0.49687 -5.721 1.10e-08
## age_group50-54 -3.27784 0.48195 -6.801 1.11e-11
## age_group55-59 -4.24987 0.47404 -8.965 < 2e-16
## age_group60-64 -4.26404 0.45671 -9.336 < 2e-16
## age_group65-69 -5.25062 0.44662 -11.756 < 2e-16
## age_group70-74 -5.71106 0.45439 -12.569 < 2e-16
## age_group75-79 -5.90758 0.47018 -12.565 < 2e-16
## age_group80+ -6.49946 0.47359 -13.724 < 2e-16
## income$15,000–$24,999 -1.63574 0.46999 -3.480 0.000503
## income$25,000–$34,999 -2.07457 0.44862 -4.624 3.82e-06
## income$35,000–$49,999 -2.54551 0.43915 -5.796 7.03e-09
## income$50,000–$99,999 -3.04298 0.41157 -7.394 1.57e-13
## income$100,000–$199,999 -3.50972 0.42999 -8.162 3.79e-16
## income$200,000 or more -3.84047 0.50103 -7.665 2.00e-14
## educationHigh school diploma or GED 0.12556 0.81238 0.155 0.877177
## educationSome college or technical school 1.11789 0.69123 1.617 0.105865
## educationCollege graduate 1.81788 0.69283 2.624 0.008711
## educationGraduate or professional degree 1.66905 0.69428 2.404 0.016240
## sexFemale:smoker_currentCurrent smoker 1.28327 0.51705 2.482 0.013089
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseYes **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000–$24,999 ***
## income$25,000–$34,999 ***
## income$35,000–$49,999 ***
## income$50,000–$99,999 ***
## income$100,000–$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## sexFemale:smoker_currentCurrent smoker *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared: 0.1826, Adjusted R-squared: 0.1798
## F-statistic: 63.61 on 28 and 7971 DF, p-value: < 2.2e-16
# Partial F-test: does the interaction term improve model fit?
anova(model_A, model_B)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7972 432509
## 2 7971 432175 1 333.97 6.1598 0.01309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract key coefficients for interpretation
coefs_B <- coef(model_B)
# Effect of current smoking among Males (reference sex)
beta_smoker <- coefs_B["smoker_currentCurrent smoker"]
# Interaction coefficient
beta_interaction <- coefs_B["sexFemale:smoker_currentCurrent smoker"]
# Effect among Females = beta_smoker + beta_interaction
effect_female <- beta_smoker + beta_interaction
cat("Effect of current smoking among Males:", round(beta_smoker, 3), "days\n")
## Effect of current smoking among Males: 1.521 days
cat("Interaction coefficient (Female × Current smoker):", round(beta_interaction, 3), "\n")
## Interaction coefficient (Female × Current smoker): 1.283
cat("Effect of current smoking among Females:", round(effect_female, 3), "days\n")
## Effect of current smoking among Females: 2.804 days
# Visualize interaction with ggeffects
pred <- ggpredict(model_B, terms = c("smoker_current", "sex"))
plot(pred) +
labs(
title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
x = "Smoking Status",
y = "Predicted Mentally Unhealthy Days (past 30)",
color = "Sex"
) +
theme_bw()
H₀: The association between current smoking and
mentally unhealthy days does not differ by sex (the interaction
coefficient equals zero).
Hₐ: The association between current smoking and
mentally unhealthy days differs between males and females.
F = 6.1598, df = 1, p = 0.01309. We reject H₀ at α = 0.05 and conclude
that sex statistically significantly modifies the association between
current smoking and mentally unhealthy days (p = 0.013).
Effect among males (β = 1.521): Among males, current smokers are estimated to report 1.521 more mentally unhealthy days in the past 30 compared to non-smokers, holding all other variables constant.
Interaction coefficient (β = 1.283): The association between current smoking and mentally unhealthy days is estimated to be 1.283 days larger among females than among males, holding all other variables constant.
Effect among females (1.521 + 1.283 = 2.804): Among females, current smokers are estimated to report 2.804 more mentally unhealthy days compared to non-smokers, holding all other variables constant.
The estimated association between current smoking and poor mental health is nearly twice as large among females (2.804 days) as among males (1.521 days), a difference of 1.283 days. This indicates that sex modifies the relationship between smoking and mental health, with smoking being more strongly associated with psychological distress among women than men in this sample.
Among U.S. adults, current smokers tend to report more days of poor mental health than non-smokers, but this pattern differs meaningfully between men and women. Women who currently smoke appear to experience a substantially greater burden of poor mental health days compared to female non-smokers, while the same difference for men, though still present, is considerably smaller. These findings suggest that tobacco use and psychological distress may be more closely intertwined in women’s health than in men’s, potentially reflecting differences in stress-related smoking motivations or the psychological effects of nicotine across sexes. Public health programs that simultaneously address smoking cessation and mental health support — particularly among women — may be especially effective in reducing the overall burden of mental health distress in the population.
The results suggest that mental health among U.S. adults is shaped by a complex interplay of physical, behavioral, and socioeconomic factors. Physical health burden was the strongest predictor: each additional physically unhealthy day was associated with approximately 0.27 more mentally unhealthy days, a finding consistent with the well-documented bidirectional relationship between physical and mental health. Smoking status also showed strong associations, with current daily smokers reporting nearly 3 additional mentally unhealthy days compared to never smokers, after adjusting for all other predictors. Age was inversely associated with mentally unhealthy days — older adults consistently reported fewer such days compared to the youngest group (18–24), which may reflect cohort differences in distress reporting, coping capacity, or health expectations. Income and education were significant predictors as well, with higher-income respondents reporting substantially fewer mentally unhealthy days than those earning less than $15,000.
A key limitation of this analysis is its cross-sectional design: because exposure and outcome are measured at the same time point, it is not possible to establish the direction of associations. For example, it is unclear whether poor physical health leads to worse mental health or vice versa. At least two important confounders may bias the associations estimated here. First, chronic disease burden is not adjusted for in this model, individuals with conditions such as diabetes or heart disease likely have elevated rates of both physical and mental health problems, which could confound the observed associations between predictors like BMI or exercise and mentally unhealthy days. Second, social support and living situation are unmeasured factors that influence mental health and are also correlated with income, exercise behavior, and smoking status, potentially biasing several of the estimated coefficients.
Adjusted R² differs from regular R² in that it penalizes for the
number of predictors in the model, decreasing when added predictors do
not improve fit proportionally. In this model with 29 terms, the small
difference between R² (0.1853) and Adjusted R² (0.1824) suggests that
the predictors collectively contribute meaningful explanatory power
rather than simply inflating fit artificially. This distinction is
especially important when adding multi-level categorical predictors such
as age group (12 dummies) and income (6 dummies), because each
additional dummy consumes a degree of freedom and R² will increase
mechanically even for uninformative predictors. The chunk tests for
income and education complement the individual t-tests from
summary() by evaluating each categorical variable as a
whole: a given dummy level may appear non-significant individually while
the variable as a group still contributes meaningfully to the model.
This is exactly what the chunk test is designed to detect, and both
tests confirmed statistically significant collective contributions from
income and education beyond what individual-level t-tests alone would
reveal.
All R code for this assignment was written and executed independently. Examples from class lecture files and additional online resources were used to troubleshoot file path errors when loading the BRFSS XPT file in RStudio Desktop on macOS, including identifying issues such as a trailing space in the filename that prevented the file from loading. These same resources were also used to guide interpretation of results for Parts 1–3, including coefficient interpretations, hypothesis test conclusions, interaction effects, and the public health summary. Output accuracy was verified by cross-checking R console results with written interpretations prior to submission.