knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 10,
fig.height = 6,
cache = FALSE
)
library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(GGally)
library(car)
library(ggeffects)
library(plotly)
library(lmtest)
library(dplyr)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
brfss_2023 <- read_xpt(
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2023.XPT"
) |>
clean_names()
brfss_A3 <- brfss_2023 |>
mutate(
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth %in% c(77, 99) ~ NA_real_,
menthlth >= 1 & menthlth <= 30 ~ menthlth
),
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth %in% c(77, 99) ~ NA_real_,
physhlth >= 1 & physhlth <= 30 ~ physhlth
),
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ bmi5 / 100
),
sex = factor(case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
)),
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_
)),
age_group = factor(case_when(
ageg5yr == 1 ~ "18–24",
ageg5yr == 2 ~ "25–29",
ageg5yr == 3 ~ "30–34",
ageg5yr == 4 ~ "35–39",
ageg5yr == 5 ~ "40–44",
ageg5yr == 6 ~ "45–49",
ageg5yr == 7 ~ "50–54",
ageg5yr == 8 ~ "55–59",
ageg5yr == 9 ~ "60–64",
ageg5yr == 10 ~ "65–69",
ageg5yr == 11 ~ "70–74",
ageg5yr == 12 ~ "75–79",
ageg5yr == 13 ~ "80+",
ageg5yr == 14 ~ NA_character_
)),
income = factor(case_when(
incomg1 == 1 ~ "Less than $15,000",
incomg1 == 2 ~ "$15,000–$24,999",
incomg1 == 3 ~ "$25,000–$34,999",
incomg1 == 4 ~ "$35,000–$49,999",
incomg1 == 5 ~ "$50,000–$99,999",
incomg1 == 6 ~ "$100,000–$199,999",
incomg1 == 7 ~ "$200,000 or more",
incomg1 == 9 ~ NA_character_
)),
education = factor(case_when(
educa %in% c(1, 2) ~ "Less than high school",
educa == 3 ~ "High school diploma or GED",
educa == 4 ~ "Some college or technical school",
educa == 5 ~ "College graduate",
educa == 6 ~ "Graduate or professional degree",
educa == 9 ~ NA_character_
)),
smoking = factor(case_when(
smoker3 == 1 ~ "Current daily smoker",
smoker3 == 2 ~ "Current some-day smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_
))
)
brfss_A3 |>
select(menthlth_days, physhlth_days, bmi, age_group, income, sex, education, smoking, exercise) |>
tbl_summary(
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body mass index",
age_group ~ "Age group",
income ~ "Household income",
sex ~ "Sex",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Highest level of education completed",
smoking ~ "Smoking status (4 levels)"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
bold_labels() |>
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023**")
| Characteristic | N | N = 433,3231 |
|---|---|---|
| Mentally unhealthy days (past 30) | 425,215 | 4.4 (8.3) |
| Physically unhealthy days (past 30) | 422,538 | 4.5 (8.8) |
| Body mass index | 392,788 | 28.5 (6.5) |
| Age group | 425,544 | |
| 18–24 | 26,280 (6.2%) | |
| 25–29 | 21,247 (5.0%) | |
| 30–34 | 24,803 (5.8%) | |
| 35–39 | 27,153 (6.4%) | |
| 40–44 | 28,463 (6.7%) | |
| 45–49 | 27,070 (6.4%) | |
| 50–54 | 31,291 (7.4%) | |
| 55–59 | 34,219 (8.0%) | |
| 60–64 | 41,974 (9.9%) | |
| 65–69 | 46,099 (11%) | |
| 70–74 | 43,533 (10%) | |
| 75–79 | 34,543 (8.1%) | |
| 80+ | 38,869 (9.1%) | |
| Household income | 346,700 | |
| $100,000–$199,999 | 76,637 (22%) | |
| $15,000–$24,999 | 31,069 (9.0%) | |
| $200,000 or more | 26,770 (7.7%) | |
| $25,000–$34,999 | 38,508 (11%) | |
| $35,000–$49,999 | 47,502 (14%) | |
| $50,000–$99,999 | 107,027 (31%) | |
| Less than $15,000 | 19,187 (5.5%) | |
| Sex | 433,323 | |
| Female | 229,541 (53%) | |
| Male | 203,782 (47%) | |
| Highest level of education completed | 430,998 | |
| College graduate | 114,346 (27%) | |
| Graduate or professional degree | 184,867 (43%) | |
| High school diploma or GED | 16,161 (3.7%) | |
| Less than high school | 9,011 (2.1%) | |
| Some college or technical school | 106,613 (25%) | |
| Smoking status (4 levels) | 410,261 | |
| Current daily smoker | 31,770 (7.7%) | |
| Current some-day smoker | 13,376 (3.3%) | |
| Former smoker | 113,134 (28%) | |
| Never smoker | 251,981 (61%) | |
| Any physical activity (past 30 days) | 432,072 | 325,227 (75%) |
| 1 Mean (SD); n (%) | ||
#Report the number and percentage of missing observations for menthlth_days, bmi, and smoking
total_n <- nrow(brfss_A3)
# Missing summary
missing_summary <- brfss_A3 |>
summarise(
menthlth_miss_n = sum(is.na(menthlth_days)),
menthlth_miss_pct = mean(is.na(menthlth_days)) * 100,
bmi_miss_n = sum(is.na(bmi)),
bmi_miss_pct = mean(is.na(bmi)) * 100,
smoking_miss_n = sum(is.na(smoking)),
smoking_miss_pct = mean(is.na(smoking)) * 100
)
missing_summary
## # A tibble: 1 × 6
## menthlth_miss_n menthlth_miss_pct bmi_miss_n bmi_miss_pct smoking_miss_n
## <int> <dbl> <int> <dbl> <int>
## 1 8108 1.87 40535 9.35 23062
## # ℹ 1 more variable: smoking_miss_pct <dbl>
set.seed(1220)
brfss_analytic <- brfss_A3 |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
nrow(brfss_analytic)
## [1] 8000
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, age_group,
income, education, smoking, exercise, sex) |>
tbl_summary(
by = sex,
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body mass index",
age_group ~ "Age group",
income ~ "Household income",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Highest level of education completed",
smoking ~ "Smoking status (4 levels)"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_n() |>
# add_overall() |>
bold_labels() |>
#italicize_levels() |>
modify_caption("**Table 1. Descriptive Statistics by Sex — BRFSS 2023 Analytic Sample (n = 8,000)**")
| Characteristic | N | Female N = 4,0641 |
Male N = 3,9361 |
|---|---|---|---|
| Mentally unhealthy days (past 30) | 8,000 | 5.1 (8.6) | 3.5 (7.5) |
| Physically unhealthy days (past 30) | 8,000 | 4.9 (8.9) | 4.0 (8.4) |
| Body mass index | 8,000 | 28.7 (7.0) | 28.7 (6.0) |
| Age group | 8,000 | ||
| 18–24 | 171 (4.2%) | 235 (6.0%) | |
| 25–29 | 189 (4.7%) | 219 (5.6%) | |
| 30–34 | 210 (5.2%) | 253 (6.4%) | |
| 35–39 | 302 (7.4%) | 263 (6.7%) | |
| 40–44 | 292 (7.2%) | 290 (7.4%) | |
| 45–49 | 252 (6.2%) | 266 (6.8%) | |
| 50–54 | 303 (7.5%) | 305 (7.7%) | |
| 55–59 | 352 (8.7%) | 308 (7.8%) | |
| 60–64 | 379 (9.3%) | 408 (10%) | |
| 65–69 | 483 (12%) | 418 (11%) | |
| 70–74 | 426 (10%) | 382 (9.7%) | |
| 75–79 | 338 (8.3%) | 325 (8.3%) | |
| 80+ | 367 (9.0%) | 264 (6.7%) | |
| Household income | 8,000 | ||
| $100,000–$199,999 | 869 (21%) | 996 (25%) | |
| $15,000–$24,999 | 370 (9.1%) | 271 (6.9%) | |
| $200,000 or more | 238 (5.9%) | 400 (10%) | |
| $25,000–$34,999 | 495 (12%) | 376 (9.6%) | |
| $35,000–$49,999 | 585 (14%) | 482 (12%) | |
| $50,000–$99,999 | 1,260 (31%) | 1,251 (32%) | |
| Less than $15,000 | 247 (6.1%) | 160 (4.1%) | |
| Highest level of education completed | 8,000 | ||
| College graduate | 1,120 (28%) | 1,018 (26%) | |
| Graduate or professional degree | 1,896 (47%) | 1,763 (45%) | |
| High school diploma or GED | 122 (3.0%) | 130 (3.3%) | |
| Less than high school | 49 (1.2%) | 75 (1.9%) | |
| Some college or technical school | 877 (22%) | 950 (24%) | |
| Smoking status (4 levels) | 8,000 | ||
| Current daily smoker | 319 (7.8%) | 339 (8.6%) | |
| Current some-day smoker | 117 (2.9%) | 151 (3.8%) | |
| Former smoker | 1,055 (26%) | 1,207 (31%) | |
| Never smoker | 2,573 (63%) | 2,239 (57%) | |
| Any physical activity (past 30 days) | 8,000 | 3,094 (76%) | 3,146 (80%) |
| 1 Mean (SD); n (%) | |||
#Part 1: Multiple Linear Regression
#Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?
#Step 1: Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output
m1 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic)
summary(m1)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.33197 0.67780 13.768 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexMale -1.39038 0.16750 -8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25–29 -1.05660 0.51938 -2.034 0.041950
## age_group30–34 -1.09395 0.50646 -2.160 0.030803
## age_group35–39 -1.81103 0.48851 -3.707 0.000211
## age_group40–44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45–49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50–54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55–59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60–64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65–69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70–74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75–79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000–$24,999 1.86281 0.36432 5.113 3.24e-07
## income$200,000 or more -0.28425 0.34023 -0.835 0.403477
## income$25,000–$34,999 1.42347 0.32097 4.435 9.33e-06
## income$35,000–$49,999 0.95299 0.29630 3.216 0.001304
## income$50,000–$99,999 0.44936 0.23068 1.948 0.051456
## incomeLess than $15,000 3.49984 0.42923 8.154 4.07e-16
## educationGraduate or professional degree -0.05310 0.21116 -0.251 0.801448
## educationHigh school diploma or GED -1.71101 0.49969 -3.424 0.000620
## educationLess than high school -1.79091 0.69119 -2.591 0.009585
## educationSome college or technical school -0.71412 0.23816 -2.998 0.002722
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexMale ***
## exerciseYes **
## age_group25–29 *
## age_group30–34 *
## age_group35–39 ***
## age_group40–44 ***
## age_group45–49 ***
## age_group50–54 ***
## age_group55–59 ***
## age_group60–64 ***
## age_group65–69 ***
## age_group70–74 ***
## age_group75–79 ***
## age_group80+ ***
## income$15,000–$24,999 ***
## income$200,000 or more
## income$25,000–$34,999 ***
## income$35,000–$49,999 **
## income$50,000–$99,999 .
## incomeLess than $15,000 ***
## educationGraduate or professional degree
## educationHigh school diploma or GED ***
## educationLess than high school **
## educationSome college or technical school **
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
#Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.
menthlth_days = 9.332 + 0.266(physhlth_days) + 0.033(bmi) - 1.390(sexMale) - 0.651(exerciseYes) - 1.057(age_group25–29) - 1.094(age_group30–34)- 1.811(age_group35–39)- 2.893(age_group40–44) - 3.056(age_group45–49)- 3.517(age_group50–54)- 4.496(age_group55–59) - 4.522(age_group60–64)- 5.578(age_group65–69)- 6.025(age_group70–74) - 6.287(age_group75–79)- 6.820(age_group80+)+ 1.863(income $15,000–$24,999)- 0.284(income $200,000 or more)+ 1.423(income $25,000–$34,999)+ 0.953(income $35,000–$49,999)+ 0.449(income $50,000–$99,999)+ 3.500(income Less than $15,000)- 0.053(education Graduate/professional)- 1.711(education High school/GED)-1.791(education Less than high school)- 0.714(education Some college)- 1.587(smoking Current some-day smoker)- 1.990(smoking Former smoker)- 2.937(smoking Never smoker)
#Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant: • physhlth_days (continuous predictor) • bmi (continuous predictor) • sex: Female vs. Male (reference) • exercise: Yes vs. No (reference) • One age_group coefficient of your choice • Two income coefficients of your choice, compared to the reference category (Less than $15,000)
PHYSHLTH_DAYS: For each additonal physically unhealthy days in the past 30 days, the number of mentally unhealthy days positively increased by 0.266 days on average, holding all other variables constant. This means that poor physical health is associated with poor mental health.
BMI: For each 1-unit increase in BMI, mentally unhealthy days positively increases by 0.033 days on average, holding all other variables constant. This means that there is a small effect between higher BMI and mentally unhealthy days.
SEX (female vs. male(reference)) Females have 1.390 more mentally unhealthy days on average than males, holding all other variables constant. This means that being female is associated with more mentally unhealthy days.
EXERCISE (yes vs. no (refernce))
Indivduals who exercised in the past 30 days have 0.651 fewer mentally unhealthy days on average compared to those who do not exercise, hold all other variables constant.This means that exercise is associated with better mental health.
AGE ( 50-54 vs 18-24(reference)):
Individuals aged 50-54 have 3.517 fewer mentally unhealthy days on average compared to those aged 18-24, holding all other variables constant. This means that older individuals report better mental health than younger individuals.
INCOME (Less than $15,000 vs. $25,000–$34,999):
Individuals earning $25,000–$34,999 have about 2.08 fewer mentally unhealthy days on average compared to those earning less than $15,000, holding all other variables constant. This means that higher income individuals are less likely to experience mentally unhealthy days.
INCOME(Less than $15,000 vs. $200,000 or more):
Individuals earning $200,000 or more have about 3.78 fewer mentally unhealthy days on average compared to those earning less than $15,000, holding all other variables constant. This shows us that higher income level is associated with better mental health days. However, this relationship is not statistically significant, with a p-value greater than 0.05. There is not enough evidence to conclude that the relationship is reliable. #Step 4: Report and interpret each of the following model-level statistics: • R-squared: proportion of variance in mentally unhealthy days explained by all predictors • Adjusted R-squared: how it differs from R-squared and what it adds • Root MSE (residual standard error): average prediction error of the model • Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion
R^2: The model has an R^2 of 0.1853, meaning that approximately 18.5% of the variability in mentally unhealthy days is explained by the predictors in the model. The adjusted R^2 is 0.1824, which is slightly lower because it takes into account for the number of predictors and penalizes for model complexity.
Root MSE: The residual stand error is 7.35, showing that on average the models prediction differs from the observed number of mentally unhealthy days by about 7.35 days.
Null hypothesis (H₀): All regression coefficients are equal to zero.
Alternative hypothesis (H₁): At least one regression coefficient is not equal to zero.
DF: 7970 F-statistic: The overall F-test is 62.52. p-value: p < 0.001
Conclusion: We can reject the null hypothesis and conclude that at least one predictor is significantly associated with mentally unhealthy days.
glance(m1) |>
select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
pivot_longer(everything(), names_to = "Statistic", values_to = "Value") |>
mutate(Statistic = dplyr::recode(Statistic,
"r.squared" = "R²",
"adj.r.squared" = "Adjusted R²",
"sigma" = "Residual Std. Error (Root MSE)",
"statistic" = "F-statistic",
"p.value" = "p-value (overall F-test)",
"df" = "Model df (p)",
"df.residual" = "Residual df (n − p − 1)",
"nobs" = "n (observations)"
)) |>
kable(caption = " Overall Model Summary") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Statistic | Value |
|---|---|
| R² | 0.1853 |
| Adjusted R² | 0.1824 |
| Residual Std. Error (Root MSE) | 7.3516 |
| F-statistic | 62.5234 |
| p-value (overall F-test) | 0.0000 |
| Model df (p) | 29.0000 |
| Residual df (n − p − 1) | 7970.0000 |
| n (observations) | 8000.0000 |
#Part 2: Tests of Hypotheses (20 Points) #Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output. Identify which predictors have statistically significant partial associations at α = 0.05.
Based on the Type III results, the predictors that are statistically significant with mentally unhealthy days is: physhlth_days, bmi, exercise, age_group, income, education, sex, and smoking. All predictors in the model are statistically significant with a p-value less than 0.05.
anova_type3 <- Anova(m1, type = "III")
anova_type3 |>
as.data.frame() |>
rownames_to_column("Source") |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Type III (Partial) Sums of Squares — car::Anova()",
col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Source | Sum of Sq | df | F value | p-value |
|---|---|---|---|---|
| (Intercept) | 10245.0838 | 1 | 189.5608 | 0.0000 |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 |
| Residuals | 430750.0872 | 7970 | NA | NA |
#Step 2 (Chunk test for income): Test whether income collectively improves the model significantly, after accounting for all other predictors: 13.Fit a reduced model that includes all predictors except income.14.Use anova(model_reduced, model_full) to compare the two models. 15.State H0 and HA for this test clearly. 16.Report the F-statistic, degrees of freedom, and p-value. 17.State your conclusion at α = 0.05.
F-statistic: 14.5982 p-value: <0.0001 DF=6 Conclusion: We can reject the null hypothesis because the p-value is less than 0.05. We can conclude that income collectively improves the fit of the model after adjusting for physhlth_days, sleep_hrs, sex, and exercise. Income is an important predictor of mentally unhealthy days in this model.
m_reduced <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
m_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
# Chunk test
anova(m_reduced, m_full) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced(excluding income)", "Full (+ demographics)"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table Chunk Test: Do demographic variables collectively add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced(excluding income) | 7976 | 435484.0 | NA | NA | NA | NA |
| Full (+ demographics) | 7970 | 430750.1 | 6 | 4733.894 | 14.5982 | 0 |
#Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.
F-statistic: 5.8521 p-value: 0.0001 DF=4 Conclusion: We can reject the null hypothesis because the p-value is less than 0.05. We can conclude that education collectively improves the fit of the model after adjusting for physhlth_days, sleep_hrs, sex, and exercise. Education is statistically significant predictor of mentally unhealthy days in this model.
m_reduced_edu <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
m_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
# Chunk test
anova(m_reduced_edu, m_full) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced(no education)", "Full (+ demographics)"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table Chunk Test: Do demographic variables collectively add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced(no education) | 7974 | 432015.2 | NA | NA | NA | NA |
| Full (+ demographics) | 7970 | 430750.1 | 4 | 1265.15 | 5.8521 | 1e-04 |
#Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()?
The type III results show that several predictors have strong independent associations with mentally unhealthy days, however the strongest contributions come from physically unhealthy days (f = 696.12), age group (F=46.40), and smoking (F=31.46), indicating these factors play an important role in mental health. Smaller but still significant factors are BMI, exercise, income, education, and sex.The chunk test shows that income significantly improves model fit (f-test=14.60, p < 0.001) and education also adds power (f-test=5.85, p=0.0001), showing us that both factors contribute to mental health outcomes. The chunk tests allows us to evaluate whether the full model improves overall fit rather than the reduced model.
#Part 3: Interaction Analysis (25 Points) Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women. Step 1: Create a binary smoking variable called smoker_current: • “Current smoker” includes current daily smokers and current some-day smokers. • “Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category.
brfss_recoded <- brfss_analytic |>
mutate(smoker_current = case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
)) |>
mutate(smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker")))
#Step 2: Fit two models: • Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. • Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.
brfss_recoded$sex <- relevel(brfss_recoded$sex, ref = "Male")
model_A <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education,
data = brfss_recoded)
tidy(model_A, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Interaction Model: Sex + smoker_current → Physical Health Days",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.1228 | 0.6105 | 8.3907 | 0.0000 | 3.9260 | 6.3197 |
| physhlth_days | 0.2686 | 0.0101 | 26.6731 | 0.0000 | 0.2489 | 0.2884 |
| bmi | 0.0334 | 0.0132 | 2.5250 | 0.0116 | 0.0075 | 0.0593 |
| sexFemale | 1.3331 | 0.1673 | 7.9675 | 0.0000 | 1.0051 | 1.6611 |
| smoker_currentCurrent smoker | 2.1287 | 0.2712 | 7.8489 | 0.0000 | 1.5971 | 2.6604 |
| exerciseYes | -0.6725 | 0.2150 | -3.1274 | 0.0018 | -1.0940 | -0.2510 |
| age_group25–29 | -0.9149 | 0.5198 | -1.7602 | 0.0784 | -1.9338 | 0.1040 |
| age_group30–34 | -0.8823 | 0.5061 | -1.7434 | 0.0813 | -1.8743 | 0.1097 |
| age_group35–39 | -1.5810 | 0.4877 | -3.2421 | 0.0012 | -2.5369 | -0.6251 |
| age_group40–44 | -2.6157 | 0.4858 | -5.3847 | 0.0000 | -3.5680 | -1.6635 |
| age_group45–49 | -2.8246 | 0.4970 | -5.6836 | 0.0000 | -3.7988 | -1.8504 |
| age_group50–54 | -3.2600 | 0.4821 | -6.7628 | 0.0000 | -4.2050 | -2.3151 |
| age_group55–59 | -4.2301 | 0.4741 | -8.9219 | 0.0000 | -5.1595 | -3.3007 |
| age_group60–64 | -4.2484 | 0.4568 | -9.3000 | 0.0000 | -5.1439 | -3.3529 |
| age_group65–69 | -5.2338 | 0.4467 | -11.7163 | 0.0000 | -6.1095 | -4.3582 |
| age_group70–74 | -5.7023 | 0.4545 | -12.5457 | 0.0000 | -6.5933 | -4.8113 |
| age_group75–79 | -5.8977 | 0.4703 | -12.5401 | 0.0000 | -6.8197 | -4.9758 |
| age_group80+ | -6.4888 | 0.4737 | -13.6975 | 0.0000 | -7.4174 | -5.5602 |
| income$15,000–$24,999 | 1.8562 | 0.3650 | 5.0855 | 0.0000 | 1.1407 | 2.5717 |
| income$200,000 or more | -0.3265 | 0.3408 | -0.9581 | 0.3380 | -0.9946 | 0.3415 |
| income$25,000–$34,999 | 1.4337 | 0.3214 | 4.4605 | 0.0000 | 0.8036 | 2.0637 |
| income$35,000–$49,999 | 0.9491 | 0.2969 | 3.1971 | 0.0014 | 0.3672 | 1.5310 |
| income$50,000–$99,999 | 0.4537 | 0.2311 | 1.9632 | 0.0497 | 0.0007 | 0.9067 |
| incomeLess than $15,000 | 3.5360 | 0.4300 | 8.2232 | 0.0000 | 2.6931 | 4.3789 |
| educationGraduate or professional degree | -0.1579 | 0.2106 | -0.7496 | 0.4535 | -0.5706 | 0.2549 |
| educationHigh school diploma or GED | -1.6896 | 0.5006 | -3.3750 | 0.0007 | -2.6710 | -0.7083 |
| educationLess than high school | -1.9035 | 0.6922 | -2.7499 | 0.0060 | -3.2604 | -0.5466 |
| educationSome college or technical school | -0.7070 | 0.2385 | -2.9636 | 0.0030 | -1.1746 | -0.2393 |
model_B <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education,
data = brfss_recoded)
tidy(model_B, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Interaction Model: Sex * smoker_current → Physical Health Days",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 5.2075 | 0.6113 | 8.5189 | 0.0000 | 4.0092 | 6.4058 |
| physhlth_days | 0.2686 | 0.0101 | 26.6788 | 0.0000 | 0.2489 | 0.2883 |
| bmi | 0.0331 | 0.0132 | 2.5017 | 0.0124 | 0.0072 | 0.0590 |
| sexFemale | 1.1855 | 0.1775 | 6.6784 | 0.0000 | 0.8376 | 1.5335 |
| smoker_currentCurrent smoker | 1.5208 | 0.3654 | 4.1621 | 0.0000 | 0.8045 | 2.2371 |
| exerciseYes | -0.6728 | 0.2150 | -3.1301 | 0.0018 | -1.0942 | -0.2515 |
| age_group25–29 | -0.9202 | 0.5196 | -1.7709 | 0.0766 | -1.9388 | 0.0984 |
| age_group30–34 | -0.8924 | 0.5059 | -1.7640 | 0.0778 | -1.8841 | 0.0993 |
| age_group35–39 | -1.5929 | 0.4875 | -3.2673 | 0.0011 | -2.5485 | -0.6372 |
| age_group40–44 | -2.6286 | 0.4856 | -5.4127 | 0.0000 | -3.5806 | -1.6766 |
| age_group45–49 | -2.8425 | 0.4969 | -5.7209 | 0.0000 | -3.8165 | -1.8686 |
| age_group50–54 | -3.2778 | 0.4820 | -6.8012 | 0.0000 | -4.2226 | -2.3331 |
| age_group55–59 | -4.2499 | 0.4740 | -8.9652 | 0.0000 | -5.1791 | -3.3206 |
| age_group60–64 | -4.2640 | 0.4567 | -9.3364 | 0.0000 | -5.1593 | -3.3688 |
| age_group65–69 | -5.2506 | 0.4466 | -11.7563 | 0.0000 | -6.1261 | -4.3751 |
| age_group70–74 | -5.7111 | 0.4544 | -12.5686 | 0.0000 | -6.6018 | -4.8203 |
| age_group75–79 | -5.9076 | 0.4702 | -12.5646 | 0.0000 | -6.8292 | -4.9859 |
| age_group80+ | -6.4995 | 0.4736 | -13.7239 | 0.0000 | -7.4278 | -5.5711 |
| income$15,000–$24,999 | 1.8740 | 0.3650 | 5.1348 | 0.0000 | 1.1586 | 2.5894 |
| income$200,000 or more | -0.3307 | 0.3407 | -0.9708 | 0.3317 | -0.9986 | 0.3371 |
| income$25,000–$34,999 | 1.4352 | 0.3213 | 4.4666 | 0.0000 | 0.8053 | 2.0650 |
| income$35,000–$49,999 | 0.9642 | 0.2968 | 3.2485 | 0.0012 | 0.3824 | 1.5461 |
| income$50,000–$99,999 | 0.4667 | 0.2311 | 2.0198 | 0.0434 | 0.0138 | 0.9197 |
| incomeLess than $15,000 | 3.5097 | 0.4300 | 8.1623 | 0.0000 | 2.6668 | 4.3526 |
| educationGraduate or professional degree | -0.1488 | 0.2105 | -0.7069 | 0.4797 | -0.5615 | 0.2639 |
| educationHigh school diploma or GED | -1.6923 | 0.5005 | -3.3815 | 0.0007 | -2.6734 | -0.7113 |
| educationLess than high school | -1.8179 | 0.6928 | -2.6239 | 0.0087 | -3.1760 | -0.4598 |
| educationSome college or technical school | -0.7000 | 0.2385 | -2.9351 | 0.0033 | -1.1675 | -0.2325 |
| sexFemale:smoker_currentCurrent smoker | 1.2833 | 0.5171 | 2.4819 | 0.0131 | 0.2697 | 2.2968 |
#Step 3: Test whether the interaction is statistically significant: 18.Use anova(model_A, model_B) to compare the two models. 19.State H0 and HA for this test. 20.Report the F-statistic, degrees of freedom, and p-value. 21.State your conclusion at α = 0.05.
H₀: There is no interaction between sex and smoking. Hₐ: There is an interaction between sex and smoking.
F-statistic: 6.16 Degrees of freedom: 1 p-value: 0.0131
Conclusion: The p-value is less than 0.05, we can reject the null hypothesis.This shows us that there is an association between smoking and mentally unhealthy days and differ by sex, showing a statistically significant interaction.
anova(model_A, model_B) |>
as.data.frame() |>
rownames_to_column("Model") |>
mutate(
Model = c("Reduced (no interaction)", "Full (with sex × smoking)"),
across(where(is.numeric), ~ round(., 4))
) |>
kable(
caption = "Table 6. Partial F-Test: Does the sex × smoking interaction improve model fit?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (no interaction) | 7972 | 432508.9 | NA | NA | NA | NA |
| Full (with sex × smoking) | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
#Step 4: Interpret the interaction using the coefficients from Model B: 22.Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language.
Among men, current smokers have 1.5208 more mentally unhealthy days on average compared to non-smokers, holding all other variables constant.
23.Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.
Among women, current smokers have 1.2833 more mentally unhealthy days on average compared to non-smokers, holding all other variables constant.
24.In 2 to 3 sentences, explain what these estimates mean together: is smoking more strongly associated with poor mental health in one sex than the other, and by how much?
Smoking is associated with more mentally unhealthy days in both men and women. However the effect of smoking is weaker among women than men by about 0.24 days. This suggests that while smoking is harmful for mental health in both groups, its impact is stronger in men.
#Step 5: Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.
brfss_recoded <- ggpredict(model_B, terms = c("smoker_current", "sex"))
plot(brfss_recoded) + labs(
x= "Smoking Status",
y= "Mentally Unhealthy Days",
title = "Interaction Between Smoking Status and Sex on Mental Health"
)
#Step 6: Write 3 to 5 sentences interpreting the interaction for a general public health audience. Do not include coefficient values or p-values. Focus on the substantive finding and its implications.
Smoking is associated with worse mental health with current smokers reporting more mentally unhealthy days than non-smokers. The relationship differs between men and women, showing that sex plays a role in how smoking impacts mental health. The results show the importance of looking at smoking and mental health by sex and by looking at both behaviors we could improve overall health.
#Part 4: Reflection (15 Points) Write a focused reflection of 250 to 400 words in continuous prose (not as a bulleted list) addressing all three topics below.
A. Findings and Limitations (6 points) What do the results suggest about the determinants of mental health among U.S. adults? Which predictors were most strongly associated? Which were not statistically significant? What are the key limitations of using cross-sectional survey data? Name at least two specific confounders that could bias the associations you estimated.
B. From Simple to Multiple Regression (4 points) What does Adjusted R-squared tell you that regular R-squared does not, and why is this distinction especially important when adding multiple categorical predictors? Why is it useful to test groups of predictors (income, education) collectively with chunk tests, rather than relying only on the individual t-test for each level?
C. AI Transparency (5 points) Describe which parts of the assignment (if any) you sought AI assistance for, what specific prompts you used, and how you verified the AI-assisted output was correct. If you did not use AI, state that explicitly.
The results suggest that mental health among adults is influenced by a combination of physical health, demographic, and behavioral factors. The largest predictor was physically unhealthy days, which showed a large association with mentally unhealthy days. Age also showed a strong pattern, with older individuals reporting fewer mental health days compared to younger adults. Smoking also showed an association with worse mental health and showed differences among men and women. BMI and exercise had a statistically significant effect but it was very small. All predictors were significant but some were stronger than others. There are important limitations to these findings. The data is cross-sectional which means we cannot establish causality. For example, poor mental health could lead to less physical activity rather than less physical activity leading to poor mental health. Additionally, unmeasured confounding may bias the results. These confounders could be healthcare and social support which may affect mental health. In addition, self-reported data may also arise measurement error. In regards to adjusted R^2, it provides a more accurate measure of model fit than R^2 because it accounts for the number of predictors in the model. Adjusted R^2 penalizes unnecessary complexity while R^2 does not, making adjusted R^2 essential when including predictors. This helps prevent over fitting and ensures that predictors being added are improving the model fit. Testing groups of predictors using chunk tests is useful because it evaluates whether a variable contributes to the model as a whole. Individual tests may fail to detect significance in the whole model if effects are spread across categories. Chunk tests can reveal the overall importance of a variable and helps shows it role in the model. For this assignment, I used AI to help debug by code. The code I used was from the lecture notes but when changing variables or model names, I encounter errors. I would input my code into AI to help determine where my error was. For example, in part 3 section 2 I had tidy(mod_int,conf.int = TRUE)realize I had not changed it to match the name of the model I was using which was model_B so I changed the code to (model_B, conf.int = TRUE). Since my code chunk came from the notes, I knew the code was mostly right, I just had an error with variable names,model names or adding commons/plus signs in certain areas, so I was able to check if the code was correct by running it again and getting the output with no errors.