library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(janitor)
library(car)
library(gtsummary)
library(ggeffects)brfss_raw <- read_xpt("/Users/nataliasmall/Downloads/EPI 553/LLCP2023.XPT ") %>%
clean_names()
#Total number of rows and columns in raw dataset
cat("Total Rows:", nrow(brfss_raw), "\n")## Total Rows: 433323
## Total Columns: 350
brfss_var <- brfss_raw %>%
select(menthlth, physhlth, bmi5, sexvar, exerany2, ageg5yr, incomg1, educa, smoker3)# ── Recode and clean ──────────────────────────────────────────────────────────
brfss_cleaned <- brfss_var %>%
mutate(
# Outcome: mentally unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Physically unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# Body mass index × 100 (divide by 100)
bmi = ifelse(bmi5 == 9999, NA, bmi5 / 100),
# Sex (1 = Male, 2 = Female)
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
# Exercise in past 30 days (1 = Yes, 2 = No)
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# Age group in 5-year categories (13 levels)
age_group = factor(ageg5yr, levels = 1:13,
labels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60-64", "65-69", "70-74", "75-79",
"80+")),
# Income (7 levels)
income = factor(incomg1, levels = 1:7,
labels = c("Less than $15,000", "$15,000-$24,999", "$25,000-$34,999", "$35,000-$49,999",
"$50,000-$99,999", "$100,000-$199,999", "$200,000 or more")),
# Education (Highest level of education completed)
education = factor(educa, levels = 1:6,
labels = c("Less than high school", "Less than high school", "High school diploma or GED",
"Some college or technical school", "College graduate",
"Graduate or professional degree")),
# Smoking status (4 levels)
smoking = factor(smoker3, levels = 1:4,
labels = c("Current daily smoker", "Current some-day smoker", "Former smoker", "Never smoker"))
)%>%
select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking)# Number and percentage of missing observations for menthlth_days and at least two other key variables, before removing any missing values
missing_report <- brfss_cleaned %>%
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") %>%
mutate(Pct_Missing = round(100 * (N_Missing / nrow(brfss_cleaned)), 1))
missing_report %>%
kable(caption = "Number And Percentage Of Missing Observations Before Exclusions",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Variable | N_Missing | Pct_Missing |
|---|---|---|
| menthlth_days | 8108 | 1.9 |
| physhlth_days | 10785 | 2.5 |
| bmi | 40535 | 9.4 |
| sex | 0 | 0.0 |
| exercise | 1251 | 0.3 |
| age_group | 7779 | 1.8 |
| income | 86623 | 20.0 |
| education | 2325 | 0.5 |
| smoking | 23062 | 5.3 |
# ── Analytic sample (reproducible random sample) ────────────────────────────
set.seed(1220)
brfss_analytic <- brfss_cleaned |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
# Save
saveRDS(brfss_analytic,
"/Users/nataliasmall/Downloads/EPI 553/brfss_hw3_analytic.rds")
# Report final analytic n
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_analytic), ncol(brfss_analytic))) %>%
kable(caption = "Analytic Dataset Dimensions") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 8000 |
| Variables | 9 |
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
tbl_summary(
by = sex,
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body mass index",
exercise ~ "Any physical activity (past 30 days)",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Highest level of education completed",
smoking ~ "Smoking status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
add_overall() |>
bold_labels() |>
italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 (n = 8,000)**") |>
as_flex_table()Characteristic | N | Overall | Male | Female |
|---|---|---|---|---|
Mentally unhealthy days (past 30) | 8,000 | 4.3 (8.1) | 3.5 (7.5) | 5.1 (8.6) |
Physically unhealthy days (past 30) | 8,000 | 4.4 (8.7) | 4.0 (8.4) | 4.9 (8.9) |
Body mass index | 8,000 | 28.7 (6.5) | 28.7 (6.0) | 28.7 (7.0) |
Any physical activity (past 30 days) | 8,000 | 6,240 (78%) | 3,146 (80%) | 3,094 (76%) |
Age group | 8,000 | |||
18-24 | 406 (5.1%) | 235 (6.0%) | 171 (4.2%) | |
25-29 | 408 (5.1%) | 219 (5.6%) | 189 (4.7%) | |
30-34 | 463 (5.8%) | 253 (6.4%) | 210 (5.2%) | |
35-39 | 565 (7.1%) | 263 (6.7%) | 302 (7.4%) | |
40-44 | 582 (7.3%) | 290 (7.4%) | 292 (7.2%) | |
45-49 | 518 (6.5%) | 266 (6.8%) | 252 (6.2%) | |
50-54 | 608 (7.6%) | 305 (7.7%) | 303 (7.5%) | |
55-59 | 660 (8.3%) | 308 (7.8%) | 352 (8.7%) | |
60-64 | 787 (9.8%) | 408 (10%) | 379 (9.3%) | |
65-69 | 901 (11%) | 418 (11%) | 483 (12%) | |
70-74 | 808 (10%) | 382 (9.7%) | 426 (10%) | |
75-79 | 663 (8.3%) | 325 (8.3%) | 338 (8.3%) | |
80+ | 631 (7.9%) | 264 (6.7%) | 367 (9.0%) | |
Annual household income | 8,000 | |||
Less than $15,000 | 407 (5.1%) | 160 (4.1%) | 247 (6.1%) | |
$15,000-$24,999 | 641 (8.0%) | 271 (6.9%) | 370 (9.1%) | |
$25,000-$34,999 | 871 (11%) | 376 (9.6%) | 495 (12%) | |
$35,000-$49,999 | 1,067 (13%) | 482 (12%) | 585 (14%) | |
$50,000-$99,999 | 2,511 (31%) | 1,251 (32%) | 1,260 (31%) | |
$100,000-$199,999 | 1,865 (23%) | 996 (25%) | 869 (21%) | |
$200,000 or more | 638 (8.0%) | 400 (10%) | 238 (5.9%) | |
Highest level of education completed | 8,000 | |||
Less than high school | 124 (1.6%) | 75 (1.9%) | 49 (1.2%) | |
High school diploma or GED | 252 (3.2%) | 130 (3.3%) | 122 (3.0%) | |
Some college or technical school | 1,827 (23%) | 950 (24%) | 877 (22%) | |
College graduate | 2,138 (27%) | 1,018 (26%) | 1,120 (28%) | |
Graduate or professional degree | 3,659 (46%) | 1,763 (45%) | 1,896 (47%) | |
Smoking status | 8,000 | |||
Current daily smoker | 658 (8.2%) | 339 (8.6%) | 319 (7.8%) | |
Current some-day smoker | 268 (3.4%) | 151 (3.8%) | 117 (2.9%) | |
Former smoker | 2,262 (28%) | 1,207 (31%) | 1,055 (26%) | |
Never smoker | 4,812 (60%) | 2,239 (57%) | 2,573 (63%) | |
1Mean (SD); n (%) | ||||
Research question:What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?
# Fit a multiple linear regression model
model_mlr <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)
# Display the full summary() output
summary(model_mlr)##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000-$24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000-$34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000-$49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000-$99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000-$199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000-$24,999 ***
## income$25,000-$34,999 ***
## income$35,000-$49,999 ***
## income$50,000-$99,999 ***
## income$100,000-$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
# Coefficient table
tidy(model_mlr, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~ round(., 3))) %>%
kable(
caption = "Table 2. Full Model Coefficients",
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 9.651 | 0.954 | 10.115 | 0.000 | 7.780 | 11.521 |
| physhlth_days | 0.266 | 0.010 | 26.384 | 0.000 | 0.246 | 0.285 |
| bmi | 0.033 | 0.013 | 2.527 | 0.012 | 0.007 | 0.059 |
| sexFemale | 1.390 | 0.168 | 8.301 | 0.000 | 1.062 | 1.719 |
| exerciseYes | -0.651 | 0.215 | -3.033 | 0.002 | -1.072 | -0.230 |
| age_group25-29 | -1.057 | 0.519 | -2.034 | 0.042 | -2.075 | -0.038 |
| age_group30-34 | -1.094 | 0.506 | -2.160 | 0.031 | -2.087 | -0.101 |
| age_group35-39 | -1.811 | 0.489 | -3.707 | 0.000 | -2.769 | -0.853 |
| age_group40-44 | -2.893 | 0.487 | -5.935 | 0.000 | -3.849 | -1.937 |
| age_group45-49 | -3.056 | 0.498 | -6.141 | 0.000 | -4.032 | -2.081 |
| age_group50-54 | -3.517 | 0.483 | -7.277 | 0.000 | -4.464 | -2.569 |
| age_group55-59 | -4.496 | 0.476 | -9.454 | 0.000 | -5.428 | -3.564 |
| age_group60-64 | -4.522 | 0.458 | -9.863 | 0.000 | -5.421 | -3.623 |
| age_group65-69 | -5.578 | 0.450 | -12.390 | 0.000 | -6.460 | -4.695 |
| age_group70-74 | -6.025 | 0.457 | -13.173 | 0.000 | -6.922 | -5.129 |
| age_group75-79 | -6.287 | 0.475 | -13.239 | 0.000 | -7.217 | -5.356 |
| age_group80+ | -6.820 | 0.477 | -14.302 | 0.000 | -7.754 | -5.885 |
| income$15,000-$24,999 | -1.637 | 0.469 | -3.491 | 0.000 | -2.556 | -0.718 |
| income$25,000-$34,999 | -2.076 | 0.448 | -4.635 | 0.000 | -2.955 | -1.198 |
| income$35,000-$49,999 | -2.547 | 0.438 | -5.812 | 0.000 | -3.406 | -1.688 |
| income$50,000-$99,999 | -3.050 | 0.411 | -7.428 | 0.000 | -3.856 | -2.245 |
| income$100,000-$199,999 | -3.500 | 0.429 | -8.154 | 0.000 | -4.341 | -2.658 |
| income$200,000 or more | -3.784 | 0.500 | -7.563 | 0.000 | -4.765 | -2.803 |
| educationHigh school diploma or GED | 0.080 | 0.811 | 0.099 | 0.921 | -1.509 | 1.669 |
| educationSome college or technical school | 1.077 | 0.690 | 1.561 | 0.119 | -0.275 | 2.429 |
| educationCollege graduate | 1.791 | 0.691 | 2.591 | 0.010 | 0.436 | 3.146 |
| educationGraduate or professional degree | 1.738 | 0.693 | 2.509 | 0.012 | 0.380 | 3.095 |
| smokingCurrent some-day smoker | -1.587 | 0.535 | -2.965 | 0.003 | -2.636 | -0.538 |
| smokingFormer smoker | -1.990 | 0.337 | -5.902 | 0.000 | -2.651 | -1.329 |
| smokingNever smoker | -2.937 | 0.322 | -9.131 | 0.000 | -3.567 | -2.306 |
Fitted regression equation: Mentally unhealthy days = 9.651 + 0.266(physhlth_days) + 0.033(bmi) + 1.390(sexFemale) + -0.651(exerciseYes) + -1.057(age_group25-29) + -1.094(age_group30-34) + -1.811(age_group35-39) + -2.893(age_group40-44) + -3.056(age_group45-49) + -3.517(age_group50-54) + -4.496(age_group55-59) + -4.522(age_group60-64) + -5.578(age_group65-69) + -6.025(age_group70-74) + -6.287(age_group75-79) + -6.820(age_group80+) + -1.637(income15,000−24,999) + -2.076(income25,000−34,999) + -2.547(income35,000−49,999) + -3.050(income50,000−99,999) + -3.500(income100,000−199,999) + -3.784(income$200,000 or more) + 0.080(educationHigh school diploma or GED) + 1.077(educationSome college or technical school) + 1.791(educationCollege graduate) + 1.738(educationGraduate or professional degree) + -1.587(smokingCurrent some-day smoker) + -1.990(smokingFormer smoker) + -2.937(smokingNever smoker)
physhlth_days (continuous predictor) (𝛽̂ = 0.266) Each additional day of poor physical health is associated with an estimated 0.266 additional mentally unhealthy days on average, holding all other variables constant (95% CI: 0.246 to 0.285).
bmi (continuous predictor) (𝛽̂ = 0.033) Each additional unit increase in bmi is associated with an estimated 0.033 additional mentally unhealthy days on average, holding all other variables constant (95% CI: 0.007 to 0.059).
sex: Female vs. Male (reference) (𝛽̂ = 1.390) Compared to males (the reference group), females report an estimated 1.390 more mentally unhealthy days on average, holding all other variables constant (95% CI: 1.062 to 1.719).
exercise: Yes vs. No (reference) (𝛽̂ = -0.651) People who engaged in physical activity in the past 30 days report an estimated 0.651 fewer mentally unhealthy days compared to those who did not exercise, adjusting for all other covariates (95% CI: -1.072 to -0.230).
age group: 70-74 (𝛽̂ = -6.025) Compared to those aged 18-24 (the reference group), individuals aged 70-74 report an estimated 6.025 fewer mentally unhealthy days on average, holding all other variables constant (95% CI: -6.922 to -5.129).
income: $15,000-24,999 (𝛽̂ = -1.637) Compared to individuals with annual household income less than $15,000 (the reference group), individuals with annual household income between 15,000-24,99 report an estimated 1.637 fewer mentally unhealthy days on average, holding all other variables constant (95% CI: -2.556 to -0.718).
income: $200,000 or more (𝛽̂ = -3.784) Compared to individuals with annual household income less than $15,000 (the reference group), individuals with annual household income 200,000 or more report an estimated 3.784 fewer mentally unhealthy days on average, holding all other variables constant (95% CI: -4.765 to -2.803).
glance(model_mlr) %>%
select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) %>%
mutate(across(where(is.numeric), ~ round(., 4))) %>%
pivot_longer(everything(), names_to = "Statistic", values_to = "Value") %>%
mutate(Statistic = dplyr::recode(Statistic,
"r.squared" = "R²",
"adj.r.squared" = "Adjusted R²",
"sigma" = "Residual Std. Error (Root MSE)",
"statistic" = "F-statistic",
"p.value" = "p-value (overall F-test)",
"df" = "Model df (p)",
"df.residual" = "Residual df (n − p − 1)",
"nobs" = "n (observations)"
)) %>%
kable(caption = "Table 3. Overall Model Summary — Full Model") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Statistic | Value |
|---|---|
| R² | 0.1853 |
| Adjusted R² | 0.1824 |
| Residual Std. Error (Root MSE) | 7.3516 |
| F-statistic | 62.5234 |
| p-value (overall F-test) | 0.0000 |
| Model df (p) | 29.0000 |
| Residual df (n − p − 1) | 7970.0000 |
| n (observations) | 8000.0000 |
R-squared:0.1853 Adjusted R-squared:0.1824 Root MSE:7.3516
Null hypothesis:𝐻0:𝛽1=𝛽2=…=𝛽𝑝=0 (none of the predictors are associated with mentally unhealthy days) F-statistic:62.5234 Degrees of freedom:(29,7970) P-value: 0.0000 (<0.001) Conclusion: Since p-value is <0.05 we reject the null hypothesis, indicating that there is statistical evidence that one or more of the predictors are associated with mentally unhealthy days.
# Type III using car::Anova()
anova_type3 <- Anova(model_mlr, type = "III")
# Display the full output
anova_type3 %>%
as.data.frame() %>%
rownames_to_column("Source") %>%
mutate(
Significant = ifelse(`Pr(>F)` < 0.05, "Yes", "No"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 4. Type III Partial Sums Of Squares For Each Predictor",
col.names = c("Source", "Sum of Sq", "df", "F value", "p-value", "Significant (α = 0.05)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
column_spec(6, bold = TRUE)| Source | Sum of Sq | df | F value | p-value | Significant (α = 0.05) |
|---|---|---|---|---|---|
| (Intercept) | 5529.7722 | 1 | 102.3152 | 0.0000 | Yes |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 | Yes |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 | Yes |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 | Yes |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 | Yes |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 | Yes |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 | Yes |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 | Yes |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 | Yes |
| Residuals | 430750.0872 | 7970 | NA | NA | NA |
Interpretation:All predictors marked “yes” have statistically significant partial associations at α = 0.05.
# Fit a reduced model that includes all predictors except income
model_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking, data = brfss_analytic)
# Compare the two models
anova(model_no_income, model_mlr) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced (no income)", "Full (with income)"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 4. Chunk Test: Does income add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (no income) | 7976 | 435484.0 | NA | NA | NA | NA |
| Full (with income) | 7970 | 430750.1 | 6 | 4733.894 | 14.5982 | 0 |
Null hypothesis:𝐻0:𝛽∗=0 (income does not add to the model given the other variables) Alternative hypothesis:𝐻1:𝛽∗≠0 (income does add to the model given the other variables) F-statistic:14.5982 Degrees of freedom:(6,7970) P-value: 0 (<0.001) Conclusion: Since p-value, 0, is <0.05 we reject the null hypothesis, indicating that there is statistical evidence that income does add to the model given the other variables.
# Fit a reduced model that includes all predictors except income
model_no_edu <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking, data = brfss_analytic)
# Compare the two models
anova(model_no_edu, model_mlr) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced (no education)", "Full (with education)"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 5. Chunk Test: Does education add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (no education) | 7974 | 432015.2 | NA | NA | NA | NA |
| Full (with education) | 7970 | 430750.1 | 4 | 1265.15 | 5.8521 | 1e-04 |
Null hypothesis:𝐻0:𝛽∗=0 (education as a group does not add to the model given the other variables) Alternative hypothesis:𝐻1:𝛽∗≠0 (at least one education coefficient adds to the model given the other variables) F-statistic:5.8521 Degrees of freedom:(4,7970) P-value: 1e-04 (<0.001) Conclusion: Since p-value, 1e-04, is <0.05 we reject the null hypothesis, indicating that there is statistical evidence that education as a group does add to the model given the other variables.
Interpretation: After adjusting for all other predictors simultaneously, the type III (partial) f-tests confirmed that each predictor of mentally unhealthy days: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking, are independent statistically significant contributers. Physical health days and smoking status were among the strongest independent contributions. The chunk test for income affirmed that income collectively improves the model significantly, after accounting for all other predictors. Likewise, the chunk test for education affirmed that education as a group collectively improves the model significantly, after accounting for all other predictors. Beyond the individual t-tests from summary(), chunk tests are used for testing groups of variables, confirming whether a group of variables collectively adds to the model.
# Binary smoking variable
brfss_analytic <- brfss_analytic %>%
mutate(
smoker_current = factor(case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
), levels = c("Non-smoker", "Current smoker"))
)# Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education.
model_a <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education, data = brfss_analytic)
# Model B (with interaction)
model_b <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education, data = brfss_analytic)# Compare the two models
anova(model_a, model_b) |>
tidy() |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(caption = "Test for Coincidence: Does Current Smoking and Mentally Unhealthy Days Differ By Sex?") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| term | df.residual | rss | df | sumsq | statistic | p.value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education | 7972 | 432508.9 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
Null hypothesis:𝐻0:𝛽2=𝛽3 = 0 (smoking-mentally unhealthy days association does not differ by sex) Alternative hypothesis:𝐻𝐴:At least one ≠ 0 (smoking-mentally unhealthy days association differs by sex) F-statistic:6.1598 Degrees of freedom:(1,7971) P-value: 0.0131 (<0.05) Conclusion: Since p-value, 0.0131, is <0.05 we reject the null hypothesis, indicating that there is statistical evidence that the smoking and mentally unhealthy days association differs significantly by sex.
# Model B Coefficient table
tidy(model_b, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~ round(., 3))) %>%
kable(
caption = "Table 6. Model B Coefficients",
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.899 | 0.929 | 7.430 | 0.000 | 5.079 | 8.720 |
| physhlth_days | 0.269 | 0.010 | 26.679 | 0.000 | 0.249 | 0.288 |
| bmi | 0.033 | 0.013 | 2.502 | 0.012 | 0.007 | 0.059 |
| sexFemale | 1.186 | 0.178 | 6.678 | 0.000 | 0.838 | 1.534 |
| smoker_currentCurrent smoker | 1.521 | 0.365 | 4.162 | 0.000 | 0.805 | 2.237 |
| exerciseYes | -0.673 | 0.215 | -3.130 | 0.002 | -1.094 | -0.251 |
| age_group25-29 | -0.920 | 0.520 | -1.771 | 0.077 | -1.939 | 0.098 |
| age_group30-34 | -0.892 | 0.506 | -1.764 | 0.078 | -1.884 | 0.099 |
| age_group35-39 | -1.593 | 0.488 | -3.267 | 0.001 | -2.549 | -0.637 |
| age_group40-44 | -2.629 | 0.486 | -5.413 | 0.000 | -3.581 | -1.677 |
| age_group45-49 | -2.843 | 0.497 | -5.721 | 0.000 | -3.817 | -1.869 |
| age_group50-54 | -3.278 | 0.482 | -6.801 | 0.000 | -4.223 | -2.333 |
| age_group55-59 | -4.250 | 0.474 | -8.965 | 0.000 | -5.179 | -3.321 |
| age_group60-64 | -4.264 | 0.457 | -9.336 | 0.000 | -5.159 | -3.369 |
| age_group65-69 | -5.251 | 0.447 | -11.756 | 0.000 | -6.126 | -4.375 |
| age_group70-74 | -5.711 | 0.454 | -12.569 | 0.000 | -6.602 | -4.820 |
| age_group75-79 | -5.908 | 0.470 | -12.565 | 0.000 | -6.829 | -4.986 |
| age_group80+ | -6.499 | 0.474 | -13.724 | 0.000 | -7.428 | -5.571 |
| income$15,000-$24,999 | -1.636 | 0.470 | -3.480 | 0.001 | -2.557 | -0.714 |
| income$25,000-$34,999 | -2.075 | 0.449 | -4.624 | 0.000 | -2.954 | -1.195 |
| income$35,000-$49,999 | -2.546 | 0.439 | -5.796 | 0.000 | -3.406 | -1.685 |
| income$50,000-$99,999 | -3.043 | 0.412 | -7.394 | 0.000 | -3.850 | -2.236 |
| income$100,000-$199,999 | -3.510 | 0.430 | -8.162 | 0.000 | -4.353 | -2.667 |
| income$200,000 or more | -3.840 | 0.501 | -7.665 | 0.000 | -4.823 | -2.858 |
| educationHigh school diploma or GED | 0.126 | 0.812 | 0.155 | 0.877 | -1.467 | 1.718 |
| educationSome college or technical school | 1.118 | 0.691 | 1.617 | 0.106 | -0.237 | 2.473 |
| educationCollege graduate | 1.818 | 0.693 | 2.624 | 0.009 | 0.460 | 3.176 |
| educationGraduate or professional degree | 1.669 | 0.694 | 2.404 | 0.016 | 0.308 | 3.030 |
| sexFemale:smoker_currentCurrent smoker | 1.283 | 0.517 | 2.482 | 0.013 | 0.270 | 2.297 |
Estimated effect among men:Males who identify as current smokers report an estimated 1.521 more mentally unhealthy days on average compared to males who do not identify as current smokers, holding all other variables constant.
Estimated effect among women:The estimated effect among women (1.521 + 1.283 = 2.804). Women who identify as current smokers report an estimated 2.804 more mentally unhealthy days on average compared to women who do not identify as current smokers, holding all other variables constant.
What this means together:Smoking appears to be more strongly associated with poor mental health in women than in men. Women who identify as current smokers report 1.283 more mentally unhealthy days on average, compared to men who. identify as current smokers, holding all other variables constant.
# 5: Visualize interaction using ggeffects::ggpredict
ggpredict(model_b, terms = c("smoker_current", "sex")) %>%
plot() +
labs(
title = "Predicted Mental Health Days by Current Smoking Status and Sex",
x = "Current Smoking Status",
y = "Predicted Mentally Unhealthy Days",
color = "Sex", fill = "Sex"
) +
theme_minimal(base_size = 13)Interpretation: Among U.S. adults, individuals regardless of sex who identify as non-smokers are predicted on average to have less mentally unhealthy days, compared to individuals who identify as current smokers. However, the strength of this association does appear to differ by sex. There is a striking dichotomy between males who identity as non-smokers mentally unhealthy days compared to males who identity as current smokers mentally unhealthy days, and females who identity as non-smokers mentally unhealthy days compared to females who identity as current smokers mentally unhealthy days. Women who smoke experience a larger gap in mental health burden compared to women who do not smoke, while this is also true for men the gap in mental health burden is larger among women. These findings suggest that public health initiatives aimed at addressing tobacco use and mental health may benefit from sex-specific approaches. However, since this analysis is cross-sectional, we cannot determine causality. —
The results suggest the determinants of mental health among U.S. adults are associated with various socioeconomic, behavioral, and lifestyle factors. Physical health days emerged as the strongest predictor, consistent with the documented bidirectional relationship between physical and mental burden. Among U.S. adults, individuals regardless of sex who identify as non-smokers are predicted on average to have less mentally unhealthy days, compared to individuals who identify as current smokers. Lower socioeconomic status (income and education) was associated with higher mental health burden, confirming the role of SDOH on population mental health. BMI was a predictor with a weaker association. Limitations of using cross-sectional survey data is that we cannot determine causality. Also, since BRFSS data is collected through random digit dialing techniques on both landlines and cell phones, data is susceptible to recall bias. Potential confounders include pre-diagnosed mental health disorders and undisclosed substance addictions; for instance, individuals in recovery may smoke more to manage withdrawal, potentially inflating the smoking-mental health association.
R-squared always increases (or stays the same) when you add a predictor, even if that predictor is random noise. It cannot be used to compare models with different numbers of predictors. Unlike R-squared, adjusted R-squared penalizes for the number of predictors. It only increases when a new predictor improves the model by more than chance alone. This distinction is important when adding multiple categorical predictors because it will confirm whether a predictor contributes to the model significantly. It is useful to test groups of predictors (income, education) collectively with chunk tests, because unlike individual t-tests, which evaluate a single coefficient, chunk tests simultaneously assess whether a group of coefficients contributes significantly to the model’s predictive power.
I used Gemini on part 0 when creating the missing report to inquire about the pivot_longer statement. The lecture notes on multiple linear regression showed an example on assessing missingness in our analytic variables before the sample was taken, but the output table had two columns. There was also an example on how to use pivot_longer in the “Tests of Hypotheses in Multiple Linear Regression” lecture, but similarly it was also only two columns. Since I knew that I wanted my output table to show 3 columns: Variable, N_Missing, and Pct_Missing, I asked Gemini if there was a way to format the statement so that 3 columns would print and not 2. I also searched on Stack Overflow. Ultimately I decided to keep the “summarise(across(everything(), ~ sum(is.na(.)))) %>% pivot_longer(everything(), names_to =”Variable”, values_to = “N_Missing”) %>%” which was consistent with past lecture notes, and then in the mutate statement add my calculation code, which displayed as the third column.
End of HW Assignment