#Research Question: What individual and behavioral factors are associated with the number of mentally unhealthy days reported by U.S. adults, and does the association between current smoking and mental health differ by sex?
#Part 0: Data Acquisition and Preparation Download Packages
library (tidyverse)
library (kableExtra)
library (car)
library (broom)
library (haven)
library (dplyr)
library (janitor)
library (gtsummary)
library (ggeffects)
Import Data
brfss_2023_full <- read_xpt(
"~/Documents/HEPI553/data/LLCP2023.XPT "
) |>
clean_names()
Select 9 Variables; Recode
brfss_subset <- brfss_2023_full |>
mutate(
# Mental health days
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77, 99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Physical unhealth days
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth %in% c(77, 99) ~ NA_real_,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# BMI
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ as.numeric(bmi5) / 100
),
# Sex
sex = factor(case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")),
# Exercise
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No","Yes")),
# Age group
age_group = factor(case_when(
ageg5yr == 1 ~ "18-24",
ageg5yr == 2 ~ "25-29",
ageg5yr == 3 ~ "30-34",
ageg5yr == 4 ~ "35-39",
ageg5yr == 5 ~ "40-44",
ageg5yr == 6 ~ "45-49",
ageg5yr == 7 ~ "50-54",
ageg5yr == 8 ~ "55-59",
ageg5yr == 9 ~ "60-64",
ageg5yr == 10 ~ "65-69",
ageg5yr == 11 ~ "70-74",
ageg5yr == 12 ~ "75-79",
ageg5yr == 13 ~ "80+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels=c("18-24","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80+")),
# Income
income = factor(case_when(
incomg1 == 1 ~ "Less than $15,000",
incomg1 == 2 ~ "$15,000 to $24,999",
incomg1 == 3 ~ "$25,000 to $34,999",
incomg1 == 4 ~ "$35,000 to $49,999",
incomg1 == 5 ~ "$50,000 to $99,999",
incomg1 == 6 ~ "$100,000 to $199,999",
incomg1 == 7 ~ "$200,000 or more",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $99,999","$100,000 to $199,999","$200,000 or more")),
# Education
education = factor(case_when(
educa %in% c(1, 2) ~ "Less than high school",
educa == 3 ~ "High school diploma or GED",
educa == 4 ~ "Some college or technical school",
educa == 5 ~ "College graduate",
educa == 6 ~ "Graduate or professional degree",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school","High school diploma or GED","Some college or technical school","College graduate","Graduate or professional degree")),
# Smoking
smoking = factor(case_when(
smoker3 == 1 ~ "Current daily smoker",
smoker3 == 2 ~ "Current some-day smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_
),
levels = c("Current daily smoker","Current some-day smoker","Former smoker","Never smoker"))
) |>
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking)
Report Missing Observations
cat("Missing values in analytic dataset:", sum(!complete.cases(brfss_subset)), "\n")
## Missing values in analytic dataset: 122555
cat("All models are fit on the same n =", nrow(brfss_subset), "observations.\n")
## All models are fit on the same n = 433323 observations.
cat("Missing menthlth_days:", sum(is.na(brfss_subset$menthlth_days)),
"(", round(mean(is.na(brfss_subset$menthlth_days)) * 100, 1), "%)\n")
## Missing menthlth_days: 8108 ( 1.9 %)
cat("Missing physhlth_days:", sum(is.na(brfss_subset$physhlth_days)),
"(", round(mean(is.na(brfss_subset$physhlth_days)) * 100, 1), "%)\n")
## Missing physhlth_days: 10785 ( 2.5 %)
cat("Missing income:", sum(is.na(brfss_subset$income)),
"(", round(mean(is.na(brfss_subset$income)) * 100, 1), "%)\n")
## Missing income: 86623 ( 20 %)
Removing Observations With Missing Values
set.seed(1220)
brfss_analytic <- brfss_subset |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
Descriptive Statistics Table, Stratified by Sex
brfss_analytic |>
select(menthlth_days, physhlth_days, age_group, sex, education, exercise, smoking, bmi, income) |>
tbl_summary(
by = sex,
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
age_group ~ "Age (years)",
education ~ "Education level",
exercise ~ "Any physical activity (past 30 days)",
smoking ~ "Smoker Status",
bmi ~ "BMI Status",
income ~ "Income Status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
bold_labels() |>
italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Statified by Sex**") |>
as_gt()
| Characteristic | N | Male N = 3,9361 |
Female N = 4,0641 |
|---|---|---|---|
| Mentally unhealthy days (past 30) | 8,000 | 3.5 (7.5) | 5.1 (8.6) |
| Physically unhealthy days (past 30) | 8,000 | 4.0 (8.4) | 4.9 (8.9) |
| Age (years) | 8,000 | ||
| 18-24 | 235 (6.0%) | 171 (4.2%) | |
| 25-29 | 219 (5.6%) | 189 (4.7%) | |
| 30-34 | 253 (6.4%) | 210 (5.2%) | |
| 35-39 | 263 (6.7%) | 302 (7.4%) | |
| 40-44 | 290 (7.4%) | 292 (7.2%) | |
| 45-49 | 266 (6.8%) | 252 (6.2%) | |
| 50-54 | 305 (7.7%) | 303 (7.5%) | |
| 55-59 | 308 (7.8%) | 352 (8.7%) | |
| 60-64 | 408 (10%) | 379 (9.3%) | |
| 65-69 | 418 (11%) | 483 (12%) | |
| 70-74 | 382 (9.7%) | 426 (10%) | |
| 75-79 | 325 (8.3%) | 338 (8.3%) | |
| 80+ | 264 (6.7%) | 367 (9.0%) | |
| Education level | 8,000 | ||
| Less than high school | 75 (1.9%) | 49 (1.2%) | |
| High school diploma or GED | 130 (3.3%) | 122 (3.0%) | |
| Some college or technical school | 950 (24%) | 877 (22%) | |
| College graduate | 1,018 (26%) | 1,120 (28%) | |
| Graduate or professional degree | 1,763 (45%) | 1,896 (47%) | |
| Any physical activity (past 30 days) | 8,000 | 3,146 (80%) | 3,094 (76%) |
| Smoker Status | 8,000 | ||
| Current daily smoker | 339 (8.6%) | 319 (7.8%) | |
| Current some-day smoker | 151 (3.8%) | 117 (2.9%) | |
| Former smoker | 1,207 (31%) | 1,055 (26%) | |
| Never smoker | 2,239 (57%) | 2,573 (63%) | |
| BMI Status | 8,000 | 28.7 (6.0) | 28.7 (7.0) |
| Income Status | 8,000 | ||
| Less than $15,000 | 160 (4.1%) | 247 (6.1%) | |
| $15,000 to $24,999 | 271 (6.9%) | 370 (9.1%) | |
| $25,000 to $34,999 | 376 (9.6%) | 495 (12%) | |
| $35,000 to $49,999 | 482 (12%) | 585 (14%) | |
| $50,000 to $99,999 | 1,251 (32%) | 1,260 (31%) | |
| $100,000 to $199,999 | 996 (25%) | 869 (21%) | |
| $200,000 or more | 400 (10%) | 238 (5.9%) | |
| 1 Mean (SD); n (%) | |||
#Part 1: Multiple Linear Regression Research Question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?
Step 1: Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.
#MLR Model
m1 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)
#Summary of MLR Model
summary(m1)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000 to $24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000 to $34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000 to $49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000 to $99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000 to $199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to $24,999 ***
## income$25,000 to $34,999 ***
## income$35,000 to $49,999 ***
## income$50,000 to $99,999 ***
## income$100,000 to $199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
tidy(m1, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Naive Model",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 9.6505 | 0.9541 | 10.1151 | 0.0000 | 7.7803 | 11.5208 |
| physhlth_days | 0.2656 | 0.0101 | 26.3841 | 0.0000 | 0.2459 | 0.2853 |
| bmi | 0.0334 | 0.0132 | 2.5274 | 0.0115 | 0.0075 | 0.0593 |
| sexFemale | 1.3904 | 0.1675 | 8.3007 | 0.0000 | 1.0620 | 1.7187 |
| exerciseYes | -0.6512 | 0.2147 | -3.0326 | 0.0024 | -1.0721 | -0.2303 |
| age_group25-29 | -1.0566 | 0.5194 | -2.0343 | 0.0420 | -2.0747 | -0.0385 |
| age_group30-34 | -1.0939 | 0.5065 | -2.1600 | 0.0308 | -2.0867 | -0.1012 |
| age_group35-39 | -1.8110 | 0.4885 | -3.7072 | 0.0002 | -2.7686 | -0.8534 |
| age_group40-44 | -2.8931 | 0.4875 | -5.9346 | 0.0000 | -3.8487 | -1.9375 |
| age_group45-49 | -3.0562 | 0.4977 | -6.1408 | 0.0000 | -4.0318 | -2.0806 |
| age_group50-54 | -3.5167 | 0.4832 | -7.2775 | 0.0000 | -4.4640 | -2.5695 |
| age_group55-59 | -4.4960 | 0.4755 | -9.4543 | 0.0000 | -5.4282 | -3.5638 |
| age_group60-64 | -4.5221 | 0.4585 | -9.8633 | 0.0000 | -5.4209 | -3.6234 |
| age_group65-69 | -5.5779 | 0.4502 | -12.3903 | 0.0000 | -6.4604 | -4.6955 |
| age_group70-74 | -6.0254 | 0.4574 | -13.1728 | 0.0000 | -6.9220 | -5.1287 |
| age_group75-79 | -6.2866 | 0.4748 | -13.2392 | 0.0000 | -7.2174 | -5.3557 |
| age_group80+ | -6.8197 | 0.4768 | -14.3019 | 0.0000 | -7.7544 | -5.8850 |
| income$15,000 to $24,999 | -1.6370 | 0.4690 | -3.4905 | 0.0005 | -2.5564 | -0.7177 |
| income$25,000 to $34,999 | -2.0764 | 0.4480 | -4.6351 | 0.0000 | -2.9545 | -1.1982 |
| income$35,000 to $49,999 | -2.5469 | 0.4382 | -5.8122 | 0.0000 | -3.4058 | -1.6879 |
| income$50,000 to $99,999 | -3.0505 | 0.4107 | -7.4277 | 0.0000 | -3.8555 | -2.2454 |
| income$100,000 to $199,999 | -3.4998 | 0.4292 | -8.1537 | 0.0000 | -4.3413 | -2.6584 |
| income$200,000 or more | -3.7841 | 0.5004 | -7.5628 | 0.0000 | -4.7649 | -2.8033 |
| educationHigh school diploma or GED | 0.0799 | 0.8107 | 0.0986 | 0.9215 | -1.5092 | 1.6690 |
| educationSome college or technical school | 1.0768 | 0.6897 | 1.5612 | 0.1185 | -0.2753 | 2.4288 |
| educationCollege graduate | 1.7909 | 0.6912 | 2.5911 | 0.0096 | 0.4360 | 3.1458 |
| educationGraduate or professional degree | 1.7378 | 0.6925 | 2.5095 | 0.0121 | 0.3803 | 3.0953 |
| smokingCurrent some-day smoker | -1.5867 | 0.5352 | -2.9645 | 0.0030 | -2.6359 | -0.5375 |
| smokingFormer smoker | -1.9897 | 0.3371 | -5.9020 | 0.0000 | -2.6506 | -1.3289 |
| smokingNever smoker | -2.9368 | 0.3216 | -9.1313 | 0.0000 | -3.5673 | -2.3063 |
Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.
Mental Unhealthy Days = 9.651 + 0.267 (physthlth_days) + 0.033 (bmi) + 1.390 (sex) - 0.651 (exercise) - 1.057 (age_group25-29) - 1.094 (age_group30-34) - 1.811 (age_group35-39) - 2.893 (age_group40-44) - 0.356 (age_group45-49) - 3.517 (age_group50-54) - 4.496 (age_group55-59) - 4.522 (age_group60-64) -5.578 (age_group65-69) - 6.025 (age_group70-74) - 6.287 (age_group75-79) - 6.820 (age_group80+) - 1.637 (income 15,000-24,999) - 2.076 ( income 25,000-34,999) - 2.547 (income 35,000-49,999) - 3.051 (income 50,000-99,999) - 3.500 (income 100,000-199,999) - 3.784 (income 200,000 or more) + 0.080 (educationHigh school diploma or GED) + 1.077 (educationSome college or technical school) + 1.791 (educationCollege graduate) + 1.738 (educationGraduate or professional degree) - 1.587 (smokingCurrent some-day smoker) - 1.990 (smokingFormer smoker) - 2.937 (smokingNever smoker)
Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant
Physhlth_days (continuous predictor) = Each additional day of poor physical health is associated with an estimated 0.267 additional mentally unhealthy day on average, holding bmi, sex, exercise,age and income constant.
Bmi (continuous predictor) = Each one-unit increase in the BMI category is associated with 0.033 fewer mentally unhealthy days on average, holding all other variables constant.
Sex = Female vs. Male (reference) = Compared to males (the reference group), females report an estimated 1.390 more mentally unhealthy days on average, holding all other variables constant.
Exercise = Yes vs. No (reference) = People who engaged in any physical activity in the past 30 days report an estimated - 0.651 fewer mentally unhealthy days compared to those who did not exercise, adjusting for all other covariates, holding all of the other variables constant.
Age_group60-64 = Each additional year of age is associated with - 4.522 fewer mentally unhealthy days on average, holding all else constant. This counter intuitive finding is well-documented — older adults often report fewer mental health difficulties, possibly due to better emotion regulation, survivor bias, or cohort effects.
Income15,000 -24,999 = Each one-unit increase in the income category (15,000 -24,999) is associated with - 1.637 fewer mentally unhealthy days on average, consistent with the well-established socioeconomic gradient in mental health, holding all other variables constant.
Income25,000 - 34,999 = Each one-unit increase in the income category (25,000 - 34,999) is associated with - 2.076 fewer mentally unhealthy days on average, consistent with the well-established socioeconomic gradient in mental health, holding all other variables constant.
Intercept = The estimated mean mental health days for a person with zero physically unhealthy days, zero bmi, sex, zero exercise, age group 60-64, smoking status, income 15,000-24,999 and income 25,000-34,999. This is a mathematical artifact — not a meaningful quantity in context.
Step 4: Report and interpret each of the following model-level statistics:
glance(m1) %>%
select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) %>%
pivot_longer(everything(), names_to = "Metric", values_to = "Value") %>%
mutate(
Value = round(Value, 4),
Metric = dplyr::recode(Metric,
"r.squared" = "R²",
"adj.r.squared" = "Adjusted R²",
"sigma" = "Root MSE (σ̂)",
"statistic" = "F-statistic (overall)",
"p.value" = "p-value (overall F-test)",
"df" = "Model df (p)",
"df.residual" = "Residual df (n − p − 1)",
"nobs" = "n (observations)"
)
) %>%
kable(caption = "Table 2. Overall Model Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Metric | Value |
|---|---|
| R² | 0.1853 |
| Adjusted R² | 0.1824 |
| Root MSE (σ̂) |
7.351
|
• R-squared: proportion of variance in mentally unhealthy days explained by all predictors
R^2: 0.1853 There is a 19% variance in mentally unhealthy days explained by all predictors.
• Adjusted R-squared: how it differs from R-squared and what it adds
R^2 adjusted: 0.1824 The R-squared is slightly higher compared to the adjusted, conveying certain predictors were penalized.
• Root MSE (residual standard error): average prediction error of the model
Root MSE: 7.351 There is a 7.4% standard error for the average prediction of the model.
• Overall F-test:
H0: Mental unhealthy days are not associated with all of the predictors H1: Mental unhealthy days are associated with all of the predictors
F-statistic: 62.52 Degrees of freedom (numerator and denominator): 29 P-value: 0
Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant and there is an association between mental unhealthy days and all the predictors (physical unhealthy days, bmi, age, sex, income, exercise and education)
#Part 2: Test of Hypotheses
Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output. Identify which predictors have statistically significant partial associations at α = 0.05.
anova_3 <- Anova(m1, type = "III")
anova_3 |>
tidy() |>
mutate(
across(where(is.numeric), \(x) round(x, 4)),
`Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
) |>
kable(
caption = "Table 3: Type III Partial Sum of Squares - Testing All Predictor's Contribution",
col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
column_spec(6, bold = TRUE)
| Predictor | Sum of Sq | df | F value | p-value | Significant? |
|---|---|---|---|---|---|
| (Intercept) | 5529.7722 | 1 | 102.3152 | 0.0000 | Yes |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 | Yes |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 | Yes |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 | Yes |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 | Yes |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 | Yes |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 | Yes |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 | Yes |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 | Yes |
| Residuals | 430750.0872 | 7970 | NA | NA | NA |
Interpretation: The predictors that are statistically significant are physhlth_days, bmi, sex, exercise, age_group, income, education and smoking.
Step 2 (Chunk test for income): Test whether income collectively improves the model significantly, after accounting for all other predictors:
# Reduced model: only income
m_reduced <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + education +smoking, data = brfss_analytic)
# Full model: income + demographics
m_full <- lm(menthlth_days ~ physhlth_days + income + bmi + exercise + age_group + education +smoking,
data = brfss_analytic)
# Chunk test
anova(m_reduced, m_full) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced (all predictors except income)", "Full (+ income)"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 4. Chunk Test: Do demographic variables collectively add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (all predictors except income) | 7977 | 440345.3 | NA | NA | NA | NA |
| Full (+ income) | 7971 | 434474.0 | 6 | 5871.359 | 17.953 | 0 |
H0: Income is not a key predictor for the model H1: Income is a key predictor for the model
F-statistic: 17.953 Degrees of freedom: 6 P-value: 0
State your conclusion at α = 0.05: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant and there is an association between income and the model.
Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.
# Reduced model: only education
m_reduced <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + income +smoking, data = brfss_analytic)
# Full model: education + demographics
m_full <- lm(menthlth_days ~ physhlth_days + income + bmi + exercise + age_group + education +smoking,
data = brfss_analytic)
# Chunk test
anova(m_reduced, m_full) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced (all predictors except education)", "Full (+ education)"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 5. Chunk Test: Do demographic variables collectively add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (all predictors except education) | 7975 | 436108 | NA | NA | NA | NA |
| Full (+ education) | 7971 | 434474 | 4 | 1634.012 | 7.4945 | 0 |
H0: Education is not a key predictor for the model H1: Education is a key predictor for the model
F-statistic: 7.4945 Degrees of freedom:4 P-value: 0
State your conclusion at α = 0.05: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant and there is an association between education and the model.
Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()?
For the Type III, physhlth_days, income, bmi, exercise, age_group, education, and smoking are significant. This means the predictors adds to the prediction of mental unhealthy days. By using the sum of squares Type III, we are able to determine if a certain variable contributes to the model given all other variables. With the chunk test, we are able to determine that both predictors income and education are statistically significant since the p-value is less than 0.05 after adjusting for the other variables. The predictor with the strongest independent contributions are physical unhealthy days. The chunk tests add beyond the individual t-tests by conveying the significance from the group of variables rather than the coefficients.
#Part 3: Interaction Anlysis Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women.
Step 1: Create a binary smoking variable called smoker_current: • “Current smoker” includes current daily smokers and current some-day smokers. • “Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category.
brfss_analytic <- brfss_analytic %>%
mutate(
smoker_current = case_when(
smoking %in% c("Current some-day smoker", "Current daily smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
smoker_current = factor(smoker_current,
levels = c("Non-smoker", "Current smoker"))
)
Step 2: Fit two models: • Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. • Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.
# The maximum model with all candidate predictors
mod_a <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education,
data = brfss_analytic)
tidy(mod_a, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Model A: Main Effects Only",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.7553 | 0.9271 | 7.2869 | 0.0000 | 4.9381 | 8.5726 |
| physhlth_days | 0.2686 | 0.0101 | 26.6731 | 0.0000 | 0.2489 | 0.2884 |
| bmi | 0.0334 | 0.0132 | 2.5250 | 0.0116 | 0.0075 | 0.0593 |
| sexFemale | 1.3331 | 0.1673 | 7.9675 | 0.0000 | 1.0051 | 1.6611 |
| smoker_currentCurrent smoker | 2.1287 | 0.2712 | 7.8489 | 0.0000 | 1.5971 | 2.6604 |
| exerciseYes | -0.6725 | 0.2150 | -3.1274 | 0.0018 | -1.0940 | -0.2510 |
| age_group25-29 | -0.9149 | 0.5198 | -1.7602 | 0.0784 | -1.9338 | 0.1040 |
| age_group30-34 | -0.8823 | 0.5061 | -1.7434 | 0.0813 | -1.8743 | 0.1097 |
| age_group35-39 | -1.5810 | 0.4877 | -3.2421 | 0.0012 | -2.5369 | -0.6251 |
| age_group40-44 | -2.6157 | 0.4858 | -5.3847 | 0.0000 | -3.5680 | -1.6635 |
| age_group45-49 | -2.8246 | 0.4970 | -5.6836 | 0.0000 | -3.7988 | -1.8504 |
| age_group50-54 | -3.2600 | 0.4821 | -6.7628 | 0.0000 | -4.2050 | -2.3151 |
| age_group55-59 | -4.2301 | 0.4741 | -8.9219 | 0.0000 | -5.1595 | -3.3007 |
| age_group60-64 | -4.2484 | 0.4568 | -9.3000 | 0.0000 | -5.1439 | -3.3529 |
| age_group65-69 | -5.2338 | 0.4467 | -11.7163 | 0.0000 | -6.1095 | -4.3582 |
| age_group70-74 | -5.7023 | 0.4545 | -12.5457 | 0.0000 | -6.5933 | -4.8113 |
| age_group75-79 | -5.8977 | 0.4703 | -12.5401 | 0.0000 | -6.8197 | -4.9758 |
| age_group80+ | -6.4888 | 0.4737 | -13.6975 | 0.0000 | -7.4174 | -5.5602 |
| income$15,000 to $24,999 | -1.6797 | 0.4698 | -3.5754 | 0.0004 | -2.6007 | -0.7588 |
| income$25,000 to $34,999 | -2.1023 | 0.4486 | -4.6861 | 0.0000 | -2.9817 | -1.2229 |
| income$35,000 to $49,999 | -2.5869 | 0.4390 | -5.8931 | 0.0000 | -3.4474 | -1.7264 |
| income$50,000 to $99,999 | -3.0823 | 0.4114 | -7.4921 | 0.0000 | -3.8887 | -2.2758 |
| income$100,000 to $199,999 | -3.5360 | 0.4300 | -8.2232 | 0.0000 | -4.3789 | -2.6931 |
| income$200,000 or more | -3.8625 | 0.5011 | -7.7078 | 0.0000 | -4.8448 | -2.8802 |
| educationHigh school diploma or GED | 0.2139 | 0.8119 | 0.2634 | 0.7922 | -1.3776 | 1.8053 |
| educationSome college or technical school | 1.1965 | 0.6907 | 1.7323 | 0.0833 | -0.1575 | 2.5505 |
| educationCollege graduate | 1.9035 | 0.6922 | 2.7499 | 0.0060 | 0.5466 | 3.2604 |
| educationGraduate or professional degree | 1.7456 | 0.6938 | 2.5160 | 0.0119 | 0.3856 | 3.1057 |
# Compare a small model to the maximum model
mod_b <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex * smoker_current, data = brfss_analytic)
anova(mod_a, mod_b) |>
tidy() |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(caption = "Model B: With Interaction ") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| term | df.residual | rss | df | sumsq | statistic | p.value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education | 7972 | 432508.9 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex * smoker_current | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
Step 3: Test whether the interaction is statistically significant:
H0: Predictor sex stratified by smoker_current is not associated with mental unhealthy days H1: Predictor sex stratified by smoker_current is associated with mental unhealthy days
F-statistic: 6.1598 Degrees of freedom: 1 P-value:.0131
State your conclusion at α = 0.05: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant, there is an association between the predictor sex stratified by current smoker and the outcome, mental unhealthy days.
Step 4: Interpret the interaction using the coefficients from Model B:
4a:Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language.
The estimated effect of current smoking among men have a 2.129 times higher chance of developing mentally unhealthy days compared to women. This signifies that men have a statistically significant change of developing the outcome.
4b:Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.
The estimated effect of current smoking among women have a 1.333 times lower chance of developing mentally unhealthy days compared to men. This signifies that women do not have a statistically significant change of developing the outcome.
4c: In 2 to 3 sentences, explain what these estimates mean together: is smoking more strongly associated with poor mental health in one sex than the other, and by how much?
When comparing both model A and model B, the estimates reveal that smoking more is strongly associated with poor mental health days in males compared to females. As both have p-values less than 0.05, the estimate for the coefficient for smoker_current which is for males have a higher statistic compared to the smoker_current plus interaction effect which is female. This conveys that one sex has a stronger association between the predictor and outcome.
Step 5: Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.
pred_int <- ggpredict(mod_b, terms = c("smoker_current", "sex"))
ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_point (size = 2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
labs(
title = "Predicted Mental Health Days by Smoking and Sex",
subtitle = "From interaction model: smoker_current * sex",
x = "smoker_current",
y = "Mental Unhealthy Days",
color = "Sex",
fill = "Sex"
) +
theme_minimal(base_size = 13) +
scale_color_brewer(palette = "Set1", labels = c("Men", "Women")) +
scale_fill_brewer(palette = "Set1", labels = c("Men", "Women"))
Step 6: Write 3 to 5 sentences interpreting the interaction for a
general public health audience. Do not include coefficient values or
p-values. Focus on the substantive finding and its implications.
When predicting mentally unhealthy days by current smoking status and sex, men who are non-smokers have a lower chance of developing higher mentally unhealthy days compared to women who are non-smokers. Women who are current smokers have a higher chance of developing more mentally unhealthy days compared to men who are current smokers. The data indicates that when analyzing smoking status stratified by sex, women have a higher chance of developing the outcome, mental unhealthy days compared to men.
#Part 4: Reflection
A. Findings and Limitations After analyzing the results, there are certain predictors that give adults a higher chance of developing mentally unhealthy days in the United States. For example, physical unhealthy days, sex, exercise, age_group, income, certain education status and smoking status convey significant results for developing the outcome. It is also important to recognize that focusing on income, education and current smoking status, they show a higher chance of mentally unhealthy days compared to the other predictors used. The bmi, education High school diploma or GED and education Some college or technical school did not have significant results. A key limitation for using cross-sectional survey data is certain biases, such as selection bias. As we have used these specific predictors, there might be other variables that do not represent the whole population. If we misrepresent the population, this can lead to inaccurate results. Another limitation to using this data is the challenge of determining causative relationships. Since this data was used from the BRFSS 2023, it is using a single time period. This might be harder to identify which predictors are correlated with each other for the overall outcome. Two specific confounders that could bias the associations I estimated could be specific health conditions and environmental factors that can disrupt mentally unhealthy days.
B. From Simple to Multiple Regression The R-squared tells us the proportion of total variance in Y when analyzing the model while the adjusted R-squared penalizes the number of predictors p. As shown the adjusted r-squared value is slightly less than the original r-squared value. The adjustment allows the individual to see how other predictors can affect the overall outcome by taking out certain variables. This is important when adding multiple categorical predictors to provide accurate results while comparing both statistics. It is important to test groups of predictors collectively with chunk tests because the chunk tests convey the significance from the group of variables rather than the coefficients themselves.
C. AI Transparency
Chunk Tests: I had a difficult time reading how to code the question. I used my reduce mode, full model and chunk test code from the lecture slides but I had a hard time putting in the right variables. Chat informed me that I should take out income for the reduced model but leave income in for the full model.
Binary Smoking Variable: I started with writing out the code for a binary variable but I had a hard time trying to figure out how to call them current and non smokers. I typed out the parts of the code chat provided into my original code to give me my results.
Two Models: After using my code from a previous lecture, I could not understand why my degrees of freedom were -1. I asked chat and it informed me I just needed to switch my model from mod_b, mod_a to mod_a, mod_b for my interaction model.
Visual Model: I used a code from the previous lecture but when I ran it, it did not show my plotted points. I asked chat what was wrong. It informed me I needed to add the geom_point line and the scale_color_brewer / scale_fill_brewers lines. So I added those new lines to my original code.