Submission: Knit this file to HTML, publish to RPubs with the title
epi553_hw03_lastname_firstname, and paste the RPubs URL in the Brightspace submission comments. Also upload this.Rmdfile to Brightspace.
# Load required packages
library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)## [1] 433323 350
## [1] 433323
## [1] 350
# Optional narrative output
cat("The raw BRFSS dataset contains", nrow(brfss_raw), "rows and", ncol(brfss_raw), "columns.\n")## The raw BRFSS dataset contains 433323 rows and 350 columns.
# Select the nine required variables
brfss <- brfss_raw %>%
select(
MENTHLTH,
PHYSHLTH,
`_BMI5`,
SEXVAR,
EXERANY2,
`_AGEG5YR`,
`_INCOMG1`,
EDUCA,
`_SMOKER3`
)
# Recode all variables
brfss_clean <- brfss %>%
mutate(
# Mentally unhealthy days
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
# Physically unhealthy days
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
),
# BMI
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
),
# Sex
sex = case_when(
SEXVAR == 1 ~ "Male",
SEXVAR == 2 ~ "Female",
TRUE ~ NA_character_
),
sex = factor(sex, levels = c("Male", "Female")),
# Exercise
exercise = case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
EXERANY2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
exercise = factor(exercise, levels = c("No", "Yes")),
# Age group
age_group = case_when(
`_AGEG5YR` == 1 ~ "18-24",
`_AGEG5YR` == 2 ~ "25-29",
`_AGEG5YR` == 3 ~ "30-34",
`_AGEG5YR` == 4 ~ "35-39",
`_AGEG5YR` == 5 ~ "40-44",
`_AGEG5YR` == 6 ~ "45-49",
`_AGEG5YR` == 7 ~ "50-54",
`_AGEG5YR` == 8 ~ "55-59",
`_AGEG5YR` == 9 ~ "60-64",
`_AGEG5YR` == 10 ~ "65-69",
`_AGEG5YR` == 11 ~ "70-74",
`_AGEG5YR` == 12 ~ "75-79",
`_AGEG5YR` == 13 ~ "80+",
`_AGEG5YR` == 14 ~ NA_character_,
TRUE ~ NA_character_
),
age_group = factor(
age_group,
levels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")
),
# Income
income = case_when(
`_INCOMG1` == 1 ~ "Less than $15,000",
`_INCOMG1` == 2 ~ "$15,000 to < $25,000",
`_INCOMG1` == 3 ~ "$25,000 to < $35,000",
`_INCOMG1` == 4 ~ "$35,000 to < $50,000",
`_INCOMG1` == 5 ~ "$50,000 to < $100,000",
`_INCOMG1` == 6 ~ "$100,000 to < $200,000",
`_INCOMG1` == 7 ~ "$200,000 or more",
`_INCOMG1` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
income = factor(
income,
levels = c("Less than $15,000",
"$15,000 to < $25,000",
"$25,000 to < $35,000",
"$35,000 to < $50,000",
"$50,000 to < $100,000",
"$100,000 to < $200,000",
"$200,000 or more")
),
# Education
education = case_when(
EDUCA %in% c(1, 2) ~ "Less than high school",
EDUCA == 3 ~ "High school diploma or GED",
EDUCA == 4 ~ "Some college or technical school",
EDUCA == 5 ~ "College graduate",
EDUCA == 6 ~ "Graduate or professional degree",
EDUCA == 9 ~ NA_character_,
TRUE ~ NA_character_
),
education = factor(
education,
levels = c("Less than high school",
"High school diploma or GED",
"Some college or technical school",
"College graduate",
"Graduate or professional degree")
),
# Smoking
smoking = case_when(
`_SMOKER3` == 1 ~ "Current daily smoker",
`_SMOKER3` == 2 ~ "Current some-day smoker",
`_SMOKER3` == 3 ~ "Former smoker",
`_SMOKER3` == 4 ~ "Never smoker",
`_SMOKER3` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
smoking = factor(
smoking,
levels = c("Current daily smoker",
"Current some-day smoker",
"Former smoker",
"Never smoker")
)
) %>%
select(
menthlth_days,
physhlth_days,
bmi,
sex,
exercise,
age_group,
income,
education,
smoking
)
# Quick structure check
glimpse(brfss_clean)## Rows: 433,323
## Columns: 9
## $ menthlth_days <dbl> 0, 0, 2, 0, 0, 3, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, …
## $ physhlth_days <dbl> 0, 0, 6, 2, 0, 2, 8, 1, 5, 0, 0, 2, 0, 0, 0, 4, 30, 15, …
## $ bmi <dbl> 30.47, 28.56, 22.31, 27.44, 25.85, 30.18, 24.41, 27.27, …
## $ sex <fct> Female, Female, Female, Female, Female, Female, Male, Fe…
## $ exercise <fct> No, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, Yes, No, Y…
## $ age_group <fct> 80+, 80+, 80+, 75-79, 75-79, 60-64, 80+, 75-79, 80+, 75-…
## $ income <fct> NA, NA, "Less than $15,000", NA, "$50,000 to < $100,000"…
## $ education <fct> College graduate, College graduate, Some college or tech…
## $ smoking <fct> Never smoker, Never smoker, Former smoker, Never smoker,…
# Report number and percentage missing for menthlth_days
# and at least two other key variables, BEFORE removing missingness
missingness_table <- brfss_clean %>%
summarize(
menthlth_days_missing_n = sum(is.na(menthlth_days)),
menthlth_days_missing_pct = mean(is.na(menthlth_days)) * 100,
physhlth_days_missing_n = sum(is.na(physhlth_days)),
physhlth_days_missing_pct = mean(is.na(physhlth_days)) * 100,
bmi_missing_n = sum(is.na(bmi)),
bmi_missing_pct = mean(is.na(bmi)) * 100,
smoking_missing_n = sum(is.na(smoking)),
smoking_missing_pct = mean(is.na(smoking)) * 100
)
missingness_table## # A tibble: 1 × 8
## menthlth_days_missing_n menthlth_days_missing_pct physhlth_days_missing_n
## <int> <dbl> <int>
## 1 8108 1.87 10785
## # ℹ 5 more variables: physhlth_days_missing_pct <dbl>, bmi_missing_n <int>,
## # bmi_missing_pct <dbl>, smoking_missing_n <int>, smoking_missing_pct <dbl>
# Optional prettier missingness table
missingness_long <- tibble(
variable = c("menthlth_days", "physhlth_days", "bmi", "smoking"),
missing_n = c(
sum(is.na(brfss_clean$menthlth_days)),
sum(is.na(brfss_clean$physhlth_days)),
sum(is.na(brfss_clean$bmi)),
sum(is.na(brfss_clean$smoking))
),
missing_pct = c(
mean(is.na(brfss_clean$menthlth_days)) * 100,
mean(is.na(brfss_clean$physhlth_days)) * 100,
mean(is.na(brfss_clean$bmi)) * 100,
mean(is.na(brfss_clean$smoking)) * 100
)
) %>%
mutate(missing_pct = round(missing_pct, 2))
missingness_long %>%
kbl(caption = "Missingness Before Creating Analytic Dataset",
col.names = c("Variable", "Missing N", "Missing %")) %>%
kable_classic(full_width = FALSE)| Variable | Missing N | Missing % |
|---|---|---|
| menthlth_days | 8108 | 1.87 |
| physhlth_days | 10785 | 2.49 |
| bmi | 40535 | 9.35 |
| smoking | 23062 | 5.32 |
# Create analytic dataset
set.seed(1220)
brfss_analytic <- brfss_clean %>%
drop_na(
menthlth_days,
physhlth_days,
bmi,
sex,
exercise,
age_group,
income,
education,
smoking
) %>%
slice_sample(n = 8000)
# Report final analytic n
nrow(brfss_analytic)## [1] 8000
## Final analytic sample size: 8000
# 12. Produce descriptive statistics table stratified by sex
table1 <- brfss_analytic %>%
tbl_summary(
by = sex,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 2,
missing = "no"
) %>%
add_overall() %>%
bold_labels()
table1| Characteristic | Overall N = 8,0001 |
Male N = 3,9361 |
Female N = 4,0641 |
|---|---|---|---|
| menthlth_days | 4.32 (8.13) | 3.53 (7.53) | 5.08 (8.60) |
| physhlth_days | 4.43 (8.69) | 3.98 (8.41) | 4.86 (8.94) |
| bmi | 28.70 (6.50) | 28.71 (5.97) | 28.69 (6.98) |
| exercise | 6,240 (78%) | 3,146 (80%) | 3,094 (76%) |
| age_group | |||
| 18-24 | 406 (5.1%) | 235 (6.0%) | 171 (4.2%) |
| 25-29 | 408 (5.1%) | 219 (5.6%) | 189 (4.7%) |
| 30-34 | 463 (5.8%) | 253 (6.4%) | 210 (5.2%) |
| 35-39 | 565 (7.1%) | 263 (6.7%) | 302 (7.4%) |
| 40-44 | 582 (7.3%) | 290 (7.4%) | 292 (7.2%) |
| 45-49 | 518 (6.5%) | 266 (6.8%) | 252 (6.2%) |
| 50-54 | 608 (7.6%) | 305 (7.7%) | 303 (7.5%) |
| 55-59 | 660 (8.3%) | 308 (7.8%) | 352 (8.7%) |
| 60-64 | 787 (9.8%) | 408 (10%) | 379 (9.3%) |
| 65-69 | 901 (11%) | 418 (11%) | 483 (12%) |
| 70-74 | 808 (10%) | 382 (9.7%) | 426 (10%) |
| 75-79 | 663 (8.3%) | 325 (8.3%) | 338 (8.3%) |
| 80+ | 631 (7.9%) | 264 (6.7%) | 367 (9.0%) |
| income | |||
| Less than $15,000 | 407 (5.1%) | 160 (4.1%) | 247 (6.1%) |
| $15,000 to < $25,000 | 641 (8.0%) | 271 (6.9%) | 370 (9.1%) |
| $25,000 to < $35,000 | 871 (11%) | 376 (9.6%) | 495 (12%) |
| $35,000 to < $50,000 | 1,067 (13%) | 482 (12%) | 585 (14%) |
| $50,000 to < $100,000 | 2,511 (31%) | 1,251 (32%) | 1,260 (31%) |
| $100,000 to < $200,000 | 1,865 (23%) | 996 (25%) | 869 (21%) |
| $200,000 or more | 638 (8.0%) | 400 (10%) | 238 (5.9%) |
| education | |||
| Less than high school | 124 (1.6%) | 75 (1.9%) | 49 (1.2%) |
| High school diploma or GED | 252 (3.2%) | 130 (3.3%) | 122 (3.0%) |
| Some college or technical school | 1,827 (23%) | 950 (24%) | 877 (22%) |
| College graduate | 2,138 (27%) | 1,018 (26%) | 1,120 (28%) |
| Graduate or professional degree | 3,659 (46%) | 1,763 (45%) | 1,896 (47%) |
| smoking | |||
| Current daily smoker | 658 (8.2%) | 339 (8.6%) | 319 (7.8%) |
| Current some-day smoker | 268 (3.4%) | 151 (3.8%) | 117 (2.9%) |
| Former smoker | 2,262 (28%) | 1,207 (31%) | 1,055 (26%) |
| Never smoker | 4,812 (60%) | 2,239 (57%) | 2,573 (63%) |
| 1 Mean (SD); n (%) | |||
# Save cleaned datasets so you do not have to reload the XPT every time
saveRDS(brfss_clean, "brfss_clean.rds")
saveRDS(brfss_analytic, "brfss_analytic.rds")Before removing missing values, there were 8,108 missing observations (1.87%) for menthlth_days.
For physhlth_days, there were 10,785 missing observations (2.49%).
For bmi, there were 40,535 missing observations (9.35%), representing the highest proportion of missing data among the variables examined.
Additionally, smoking had 23,062 missing observations (5.32%).
The analytic dataset was created by removing observations with missing values on all nine analysis variables using drop_na(), followed by drawing a random sample of 8,000 observations using slice_sample() with set.seed(1220) to ensure reproducibility. The final analytic sample size was n = 8,000.
Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?
# =========================================================
# Part 1: Multiple Linear Regression
# =========================================================
# 1. Fit the model
model <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
# Display full model output
summary(model)##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000 to < $25,000 -1.63703 0.46899 -3.491 0.000485
## income$25,000 to < $35,000 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000 to < $50,000 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000 to < $100,000 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000 to < $200,000 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
# Step 2. Extract coefficients (for regression equation)
#Rounded coefficients for equation
coef_table <- round(coef(model), 3)
coef_table## (Intercept)
## 9.651
## physhlth_days
## 0.266
## bmi
## 0.033
## sexFemale
## 1.390
## exerciseYes
## -0.651
## age_group25-29
## -1.057
## age_group30-34
## -1.094
## age_group35-39
## -1.811
## age_group40-44
## -2.893
## age_group45-49
## -3.056
## age_group50-54
## -3.517
## age_group55-59
## -4.496
## age_group60-64
## -4.522
## age_group65-69
## -5.578
## age_group70-74
## -6.025
## age_group75-79
## -6.287
## age_group80+
## -6.820
## income$15,000 to < $25,000
## -1.637
## income$25,000 to < $35,000
## -2.076
## income$35,000 to < $50,000
## -2.547
## income$50,000 to < $100,000
## -3.050
## income$100,000 to < $200,000
## -3.500
## income$200,000 or more
## -3.784
## educationHigh school diploma or GED
## 0.080
## educationSome college or technical school
## 1.077
## educationCollege graduate
## 1.791
## educationGraduate or professional degree
## 1.738
## smokingCurrent some-day smoker
## -1.587
## smokingFormer smoker
## -1.990
## smokingNever smoker
## -2.937
The fitted regression equation is:
menthlth_days = 9.651 + 0.266(physhlth_days) + 0.033(bmi) + 1.390(Female) − 0.651(Exercise: Yes) − 1.057(age 25–29) − 1.094(age 30–34) − 1.811(age 35–39) − 2.893(age 40–44) − 3.056(age 45–49) − 3.517(age 50–54) − 4.496(age 55–59) − 4.522(age 60–64) − 5.578(age 65–69) − 6.025(age 70–74) − 6.287(age 75–79) − 6.820(age 80+) − 1.637($15k–<$25k) − 2.076($25k–<$35k) − 2.547($35k–<$50k) − 3.050($50k–<$100k) − 3.500($100k–<$200k) − 3.784($200k+)
0.080(HS/GED) + 1.077(Some college) + 1.791(College graduate) + 1.738(Graduate degree) − 1.990(Current some-day smoker) − 1.587(Former smoker) − 2.937(Never smoker)
Coefficients:
physhlth_days: For each additional physically unhealthy day, the number of mentally unhealthy days is expected to increase by 0.266 days, holding all other variables constant.
bmi: For each one-unit increase in BMI, mentally unhealthy days increase by 0.033 days, holding all other variables constant.
sex (Female vs Male): Compared to males, females report 1.390 more mentally unhealthy days, holding all other variables constant.
exercise (Yes vs No): Individuals who exercised reported 0.651 fewer mentally unhealthy days compared to those who did not exercise, holding all other variables constant.
age_group (example: 25–29): Compared to adults aged 18–24, individuals aged 25–29 report 1.057 fewer mentally unhealthy days, holding all other variables constant.
income (example 1: $50k–$100k): Compared to individuals earning less than $15,000, those earning $50,000 to < $100,000 report 3.050 fewer mentally unhealthy days, holding all other variables constant.
income (example 2: $200k+): Compared to individuals earning less than $15,000, those earning $200,000 or more report 3.784 fewer mentally unhealthy days, holding all other variables constant.
R-squared
The R-squared value is 0.185, indicating that approximately 18.5% of the variability in mentally unhealthy days is explained by the predictors included in the model.
Adjusted R-squared
The adjusted R-squared is 0.182, which is slightly lower than the R-squared because it accounts for the number of predictors in the model. This provides a more accurate estimate of model fit by penalizing the inclusion of unnecessary variables.
Root MSE (Residual Standard Error)
The residual standard error (Root MSE) is 7.352, meaning that the model’s predicted mentally unhealthy days differ from the observed values by about 7.35 days on average.
Overall F-test
Hypotheses:
The null hypothesis (H₀) is that all regression coefficients are equal to zero (i.e., none of the predictors are associated with mentally unhealthy days). The alternative hypothesis (H₁) is that at least one predictor is associated with mentally unhealthy days.
Test statistic:
The overall F-test was F(29, 7970) = 62.52, with a p-value < 0.001.
Conclusion:
Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the model provides a statistically significant fit. This indicates that at least one predictor is significantly associated with mentally unhealthy days. However, while the model explains a meaningful portion of variation in mental health, substantial unexplained variability remains, suggesting that other important factors are not captured.
# =========================================================
# Part 2: Tests of Hypotheses
# =========================================================
library(car)
# Full model from Part 1
model_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
# ---------------------------------------------------------
# Step 1: Type III (partial) sums of squares
# ---------------------------------------------------------
type3_results <- car::Anova(model_full, type = "III")
type3_results## Anova Table (Type III tests)
##
## Response: menthlth_days
## Sum Sq Df F value Pr(>F)
## (Intercept) 5530 1 102.3152 < 2.2e-16 ***
## physhlth_days 37623 1 696.1198 < 2.2e-16 ***
## bmi 345 1 6.3879 0.0115095 *
## sex 3724 1 68.9012 < 2.2e-16 ***
## exercise 497 1 9.1966 0.0024325 **
## age_group 30092 12 46.3986 < 2.2e-16 ***
## income 4734 6 14.5982 < 2.2e-16 ***
## education 1265 4 5.8521 0.0001064 ***
## smoking 5101 3 31.4613 < 2.2e-16 ***
## Residuals 430750 7970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Optional: identify significant predictors at alpha = 0.05
type3_df <- as.data.frame(type3_results)
type3_df$Predictor <- rownames(type3_df)
type3_sig <- type3_df %>%
dplyr::filter(`Pr(>F)` < 0.05)
type3_sig## Sum Sq Df F value Pr(>F) Predictor
## (Intercept) 5529.7722 1 102.315207 6.599585e-24 (Intercept)
## physhlth_days 37622.7952 1 696.119831 3.785884e-147 physhlth_days
## bmi 345.2408 1 6.387855 1.150955e-02 bmi
## sex 3723.8662 1 68.901236 1.205544e-16 sex
## exercise 497.0434 1 9.196599 2.432451e-03 exercise
## age_group 30092.1774 12 46.398647 1.377631e-107 age_group
## income 4733.8943 6 14.598232 1.192413e-16 income
## education 1265.1504 4 5.852145 1.064433e-04 education
## smoking 5101.1076 3 31.461265 3.285702e-20 smoking
# ---------------------------------------------------------
# Step 2: Chunk test for income
# Reduced model excludes income
# ---------------------------------------------------------
model_no_income <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
income_chunk_test <- anova(model_no_income, model_full)
income_chunk_test## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7976 435484
## 2 7970 430750 6 4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ---------------------------------------------------------
# Step 3: Chunk test for education
# Reduced model excludes education
# ---------------------------------------------------------
model_no_education <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
education_chunk_test <- anova(model_no_education, model_full)
education_chunk_test## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7974 432015
## 2 7970 430750 4 1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Type III sums of squares table was used to assess the partial association of each predictor with mentally unhealthy days, controlling for all other variables in the model. At α = 0.05, the following predictors had statistically significant partial associations with mentally unhealthy days: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking (all p-values < 0.05).
Hypotheses
H₀: All income coefficients are equal to 0 after accounting for the other predictors.
Hₐ: At least one income coefficient is not equal to 0 after accounting for the other predictors.
The chunk test comparing the reduced model (without income) to the full model yielded F(6, 7970) = 14.60, with p < 0.001.
Since the p-value is less than 0.05, we reject the null hypothesis and conclude that income as a group significantly improves the model after accounting for the other predictors.
Hypotheses
H₀: All education coefficients are equal to 0 after accounting for the other predictors. Hₐ: At least one education coefficient is not equal to 0 after accounting for the other predictors.
Test results
The chunk test comparing the reduced model (without education) to the full model yielded F(4, 7970) = 5.85, with p < 0.001.
Conclusion
Since the p-value is less than 0.05, we reject the null hypothesis and conclude that education as a group significantly improves the model after accounting for the other predictors.
The Type III results indicate that all predictors, including physical health days, BMI, sex, exercise, age group, income, education, and smoking, have statistically significant partial associations with mentally unhealthy days after adjusting for other variables. Among these, physhlth_days and age_group appear to make the strongest independent contributions based on their large F-statistics. The chunk test for income demonstrated that income as a group significantly improves model fit beyond the other predictors, and the chunk test for education showed a similar significant contribution. These chunk tests provide additional insight beyond individual t-tests by evaluating whether groups of related variables jointly contribute to the model, even if some individual categories may not be significant on their own. Overall, physical health and smoking emerged as the strongest independent predictors, while education showed weaker and less consistent associations after adjustment.
Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women.
# =========================================================
# Part 3: Interaction Analysis
# =========================================================
library(tidyverse)
library(ggeffects)
# Step 1: Create binary smoking variable
brfss_analytic <- brfss_analytic %>%
mutate(
smoker_current = case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker"))
)
# Check counts
table(brfss_analytic$smoker_current, useNA = "ifany")##
## Non-smoker Current smoker
## 7074 926
# Step 2: Fit Model A (main effects only)
model_A <- lm(
menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_A)##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2913 -3.8732 -1.6219 0.6681 30.4937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.75533 0.92706 7.287 3.48e-13
## physhlth_days 0.26862 0.01007 26.673 < 2e-16
## bmi 0.03341 0.01323 2.525 0.011589
## sexFemale 1.33309 0.16732 7.967 1.84e-15
## smoker_currentCurrent smoker 2.12874 0.27121 7.849 4.74e-15
## exerciseYes -0.67248 0.21503 -3.127 0.001770
## age_group25-29 -0.91492 0.51978 -1.760 0.078412
## age_group30-34 -0.88226 0.50606 -1.743 0.081301
## age_group35-39 -1.58102 0.48765 -3.242 0.001191
## age_group40-44 -2.61572 0.48577 -5.385 7.46e-08
## age_group45-49 -2.82462 0.49698 -5.684 1.37e-08
## age_group50-54 -3.26004 0.48206 -6.763 1.45e-11
## age_group55-59 -4.23010 0.47413 -8.922 < 2e-16
## age_group60-64 -4.24840 0.45682 -9.300 < 2e-16
## age_group65-69 -5.23383 0.44671 -11.716 < 2e-16
## age_group70-74 -5.70233 0.45453 -12.546 < 2e-16
## age_group75-79 -5.89774 0.47031 -12.540 < 2e-16
## age_group80+ -6.48879 0.47372 -13.697 < 2e-16
## income$15,000 to < $25,000 -1.67974 0.46981 -3.575 0.000352
## income$25,000 to < $35,000 -2.10230 0.44863 -4.686 2.83e-06
## income$35,000 to < $50,000 -2.58691 0.43898 -5.893 3.95e-09
## income$50,000 to < $100,000 -3.08227 0.41140 -7.492 7.50e-14
## income$100,000 to < $200,000 -3.53598 0.43000 -8.223 2.30e-16
## income$200,000 or more -3.86251 0.50112 -7.708 1.43e-14
## educationHigh school diploma or GED 0.21386 0.81186 0.263 0.792237
## educationSome college or technical school 1.19653 0.69073 1.732 0.083264
## educationCollege graduate 1.90349 0.69219 2.750 0.005974
## educationGraduate or professional degree 1.74564 0.69382 2.516 0.011890
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseYes **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school .
## educationCollege graduate **
## educationGraduate or professional degree *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.366 on 7972 degrees of freedom
## Multiple R-squared: 0.182, Adjusted R-squared: 0.1792
## F-statistic: 65.7 on 27 and 7972 DF, p-value: < 2.2e-16
# Fit Model B (with interaction)
model_B <- lm(
menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_B)##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.337 -3.837 -1.604 0.628 30.426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.89938 0.92857 7.430 1.20e-13
## physhlth_days 0.26859 0.01007 26.679 < 2e-16
## bmi 0.03309 0.01323 2.502 0.012380
## sexFemale 1.18553 0.17752 6.678 2.58e-11
## smoker_currentCurrent smoker 1.52079 0.36539 4.162 3.19e-05
## exerciseYes -0.67285 0.21496 -3.130 0.001753
## age_group25-29 -0.92018 0.51962 -1.771 0.076616
## age_group30-34 -0.89242 0.50591 -1.764 0.077774
## age_group35-39 -1.59287 0.48752 -3.267 0.001090
## age_group40-44 -2.62864 0.48564 -5.413 6.39e-08
## age_group45-49 -2.84255 0.49687 -5.721 1.10e-08
## age_group50-54 -3.27784 0.48195 -6.801 1.11e-11
## age_group55-59 -4.24987 0.47404 -8.965 < 2e-16
## age_group60-64 -4.26404 0.45671 -9.336 < 2e-16
## age_group65-69 -5.25062 0.44662 -11.756 < 2e-16
## age_group70-74 -5.71106 0.45439 -12.569 < 2e-16
## age_group75-79 -5.90758 0.47018 -12.565 < 2e-16
## age_group80+ -6.49946 0.47359 -13.724 < 2e-16
## income$15,000 to < $25,000 -1.63574 0.46999 -3.480 0.000503
## income$25,000 to < $35,000 -2.07457 0.44862 -4.624 3.82e-06
## income$35,000 to < $50,000 -2.54551 0.43915 -5.796 7.03e-09
## income$50,000 to < $100,000 -3.04298 0.41157 -7.394 1.57e-13
## income$100,000 to < $200,000 -3.50972 0.42999 -8.162 3.79e-16
## income$200,000 or more -3.84047 0.50103 -7.665 2.00e-14
## educationHigh school diploma or GED 0.12556 0.81238 0.155 0.877177
## educationSome college or technical school 1.11789 0.69123 1.617 0.105865
## educationCollege graduate 1.81788 0.69283 2.624 0.008711
## educationGraduate or professional degree 1.66905 0.69428 2.404 0.016240
## sexFemale:smoker_currentCurrent smoker 1.28327 0.51705 2.482 0.013089
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseYes **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## sexFemale:smoker_currentCurrent smoker *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared: 0.1826, Adjusted R-squared: 0.1798
## F-statistic: 63.61 on 28 and 7971 DF, p-value: < 2.2e-16
# Step 3: Test interaction with partial F-test
interaction_test <- anova(model_A, model_B)
interaction_test## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7972 432509
## 2 7971 432175 1 333.97 6.1598 0.01309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Intercept)
## 6.89937969
## physhlth_days
## 0.26858758
## bmi
## 0.03308926
## sexFemale
## 1.18553147
## smoker_currentCurrent smoker
## 1.52079072
## exerciseYes
## -0.67284825
## age_group25-29
## -0.92018465
## age_group30-34
## -0.89241633
## age_group35-39
## -1.59287470
## age_group40-44
## -2.62863606
## age_group45-49
## -2.84254506
## age_group50-54
## -3.27783622
## age_group55-59
## -4.24986875
## age_group60-64
## -4.26403799
## age_group65-69
## -5.25061692
## age_group70-74
## -5.71105572
## age_group75-79
## -5.90757854
## age_group80+
## -6.49945710
## income$15,000 to < $25,000
## -1.63574119
## income$25,000 to < $35,000
## -2.07456733
## income$35,000 to < $50,000
## -2.54550750
## income$50,000 to < $100,000
## -3.04298017
## income$100,000 to < $200,000
## -3.50972483
## income$200,000 or more
## -3.84047036
## educationHigh school diploma or GED
## 0.12555591
## educationSome college or technical school
## 1.11789017
## educationCollege graduate
## 1.81787943
## educationGraduate or professional degree
## 1.66905037
## sexFemale:smoker_currentCurrent smoker
## 1.28327305
effect_men <- coef(model_B)["smoker_currentCurrent smoker"]
effect_women <- coef(model_B)["smoker_currentCurrent smoker"] +
coef(model_B)["sexFemale:smoker_currentCurrent smoker"]
effect_men## smoker_currentCurrent smoker
## 1.520791
## smoker_currentCurrent smoker
## 2.804064
## smoker_currentCurrent smoker
## 1.521
## smoker_currentCurrent smoker
## 2.804
# Step 5: Visualize the interaction
interaction_plot_data <- ggpredict(model_B, terms = c("smoker_current", "sex"))
plot(interaction_plot_data) +
labs(
x = "Smoking status",
y = "Predicted mentally unhealthy days",
title = "Predicted mentally unhealthy days by smoking status and sex",
color = "Sex"
)Hypotheses
H₀: The interaction between sex and current smoking equals 0 (no effect modification). Hₐ: The interaction between sex and current smoking is not 0 (effect modification exists).
Results
From ANOVA table:
F(1, 7971) = 6.16 p = 0.0131 Conclusion
Since p = 0.013 < 0.05, we reject the null hypothesis
There is statistically significant evidence that the association between current smoking and mentally unhealthy days differs by sex.
Effect among men (reference group)
Coefficient:
smoker_currentCurrent smoker = 1.52079
Rounded:
Among men, being a current smoker is associated with 1.521 additional mentally unhealthy days compared with non-smokers, holding all other variables constant.
Effect among women
Smoking effect = 1.52079 Interaction = 1.28327
Total: 1.52079 + 1.28327 = 2.80406
Rounded:
Among women, being a current smoker is associated with 2.804 additional mentally unhealthy days compared with non-smokers, holding all other variables constant.
The association between current smoking and mentally unhealthy days is significantly stronger among women than men. While male smokers report approximately 1.5 additional mentally unhealthy days compared to non-smokers, female smokers report about 2.8 additional days. This difference of roughly 1.28 days indicates that smoking has a greater negative impact on mental health among women, suggesting meaningful effect modification by sex.
Current smoking is associated with worse mental health among U.S. adults, and this relationship differs between men and women. While both male and female smokers report more mentally unhealthy days than non-smokers, the increase is notably larger among women. This suggests that women may experience a greater mental health burden related to smoking or may be more vulnerable to its psychological effects. From a public health perspective, these findings highlight the importance of integrating mental health support into smoking cessation programs and suggest that interventions may need to be tailored by sex to be most effective.
This analysis suggests that mental health among U.S. adults is shaped by a combination of physical health, behavioral factors, and socioeconomic conditions. The strongest predictor was physically unhealthy days, which showed a large and highly significant positive association with mentally unhealthy days, indicating that individuals experiencing more physical illness also report substantially worse mental health. Current smoking was also an important predictor, with a stronger association among women than men, highlighting meaningful effect modification by sex. Income and age were also strongly associated, with higher income levels and older age groups generally reporting fewer mentally unhealthy days. In contrast, some education categories (e.g., high school diploma or GED) and certain younger age groups were not statistically significant, suggesting weaker independent contributions after adjustment.
Several limitations should be considered. Because the data are cross-sectional, causal relationships cannot be established, and reverse causation is possible (e.g., poor mental health may influence smoking behavior). Additionally, all variables are self-reported, introducing potential recall bias and misclassification. Residual confounding is also a concern. For example, access to healthcare may influence both physical and mental health outcomes, and underlying mental health diagnoses (such as depression or anxiety disorders) could affect both smoking behavior and reported mentally unhealthy days. Other important confounders may include employment status and social support, which were not included in the model.
Adjusted R-squared provides a more informative measure of model fit than R-squared because it accounts for the number of predictors included. While R-squared will always increase when additional variables are added, Adjusted R-squared penalizes unnecessary complexity and helps determine whether added predictors meaningfully improve explanatory power. This distinction is especially important when including categorical variables like income and education, which introduce multiple parameters simultaneously.
Finally, chunk tests are useful because they evaluate the collective contribution of related predictors. Even if individual levels of a categorical variable are not statistically significant, the variable as a whole may still explain meaningful variation in the outcome. This provides a more complete understanding of how broader constructs, such as socioeconomic status, influence mental health beyond individual category comparisons.
End of Homework 3