Part 0: Data Acquisition and Preparation

Step 1: Downloading Packages

# Load necessary libraries
library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.5.2
library(kableExtra)     
library(car)        
library(broom) 
library(haven)
library(dplyr)
library(janitor)
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.5.2
library(gt)
## Warning: package 'gt' was built under R version 4.5.2
library(ggeffects)
## Warning: package 'ggeffects' was built under R version 4.5.2
library(ggplot2)

Step 2: Downloading Dataset and Cleaning Names

brfss_2023 <- readRDS("brfss_2023.rds")
brfss_2023 <- brfss_2023 %>%
  clean_names()

The raw dataset, brfss_2023 has 433,323 observations and 350 variables.

Step 3: Cleaning Dataset & Recoding Variables

brfss_clean <- brfss_2023 |>
  mutate(
    
    # Mental health days
    menthlth_days = case_when(
      menthlth == 88 ~ 0,
      menthlth %in% c(77, 99) ~ NA_real_,
      menthlth >= 1 & menthlth <= 30  ~ as.numeric(menthlth), 
      TRUE ~ NA_real_
    ),
    
    # Physical unhealth days
    physhlth_days = case_when(
      physhlth == 88 ~ 0,
      physhlth %in% c(77, 99) ~ NA_real_,
      physhlth >= 1 & physhlth <= 30  ~ as.numeric(physhlth),
      TRUE ~ NA_real_
    ),
    
    # BMI
    bmi = case_when(
      bmi5 == 9999 ~ NA_real_,
      TRUE ~ as.numeric(bmi5) / 100
    ),
    
    # Sex
    sex = factor(case_when(
      sexvar == 1 ~ "Male",
      sexvar == 2 ~ "Female",
      TRUE ~ NA_character_
    ),
    levels = c("Male", "Female")),
    
    # Exercise
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      exerany2 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    levels = c("No","Yes")),
    
    # Age group
    age_group = factor(case_when(
      ageg5yr == 1 ~ "18-24",
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64",
      ageg5yr == 10 ~ "65-69",
      ageg5yr == 11 ~ "70-74",
      ageg5yr == 12 ~ "75-79",
      ageg5yr == 13 ~ "80+",
      ageg5yr == 14 ~ NA_character_, 
      TRUE ~ NA_character_
    ),
    levels=c("18-24",
             "25-29",
             "30-34",
             "35-39",
             "40-44",
             "45-49",
             "50-54",
             "55-59",
             "60-64",
             "65-69",
             "70-74",
             "75-79",
             "80+")),
    
  # Income
   income = factor(case_when(
        incomg1 == 1 ~ "Less than $15,000",
        incomg1 == 2 ~ "$15,000-$24,999",
        incomg1 == 3 ~ "$25,000-$34,999",
        incomg1 == 4 ~ "$35,000-$49,999",
        incomg1 == 5 ~ "$50,000-$99,999",
        incomg1 == 6 ~ "$100,000-$199,999",
        incomg1 == 7 ~ "$200,000 or more",
        incomg1 == 9 ~ NA_character_,
        TRUE         ~ NA_character_
      ),
    levels = c("Less than $15,000",
               "$15,000-$24,999",
               "$25,000-$34,999",
               "$35,000-$49,999",
               "$50,000-$99,999",
               "$100,000-$199,999",
               "$200,000 or more")),
    
    # Education
    education = factor(case_when(
      educa %in% c(1, 2) ~ "Less than high school",
      educa == 3 ~ "High school diploma or GED",
      educa == 4 ~ "Some college or technical school",
      educa == 5 ~ "College graduate",
      educa == 6 ~ "Graduate or professional degree",
      educa == 9 ~ NA_character_, 
      TRUE ~ NA_character_
    ), 
     levels = c("Less than high school",
                 "High school diploma or GED",
                 "Some college or technical school",
                 "College graduate",
                 "Graduate or professional degree")),
    
    # Smoking
    smoking = factor(case_when(
      smoker3 == 1 ~ "Current daily smoker",
      smoker3 == 2 ~ "Current some-day smoker",
      smoker3 == 3 ~ "Former smoker",
      smoker3 == 4 ~ "Never smoker",
      smoker3 == 9 ~ NA_character_
    ), 
    levels = c("Current daily smoker",
               "Current some-day smoker",
               "Former smoker","Never smoker"))
    
  ) |> 
  
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking)

Step 4: Report Missing Variables

cat("Total N:", nrow(brfss_clean), "\n")
## Total N: 433323
# Missing values for key variables
cat("Missing menthlth_days:", sum(is.na(brfss_clean$menthlth_days)), 
    "(", round(mean(is.na(brfss_clean$menthlth_days)) * 100, 1), "%)\n")
## Missing menthlth_days: 8108 ( 1.9 %)
cat("Missing physhlth_days:", sum(is.na(brfss_clean$physhlth_days)), 
    "(", round(mean(is.na(brfss_clean$physhlth_days)) * 100, 1), "%)\n")
## Missing physhlth_days: 10785 ( 2.5 %)
cat("Missing bmi:", sum(is.na(brfss_clean$bmi)), 
    "(", round(mean(is.na(brfss_clean$bmi)) * 100, 1), "%)\n")
## Missing bmi: 40535 ( 9.4 %)

Step 5: Drop Missing Values

set.seed(1220)
brfss_analytic <- brfss_clean |>
 drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
 slice_sample(n = 8000)

Step 6: Saving RDS File

saveRDS(brfss_analytic, "brfss_analytic.rds")

Step 7: Descriptive Statistics Table

brfss_analytic |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
  tbl_summary(
    by = sex,
    label = list(
      menthlth_days ~ "Mentally Unhealthy Days (past 30)",
      physhlth_days ~ "Physically Unhealthy Days (past 30)",
      bmi ~ "BMI",
      exercise ~ "Any Physical Activity (past 30 days)",
      age_group ~ "Age Category (years)",
      income ~ "Income Category",
      education ~ "Education Level",
      smoking ~ "Smoking Status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Descriptive Statistics — BRFSS 2023, Stratified by Sex**") |>
  as_gt()
Descriptive Statistics — BRFSS 2023, Stratified by Sex
Characteristic N Male
N = 3,936
1
Female
N = 4,064
1
Mentally Unhealthy Days (past 30) 8,000 3.5 (7.5) 5.1 (8.6)
Physically Unhealthy Days (past 30) 8,000 4.0 (8.4) 4.9 (8.9)
BMI 8,000 28.7 (6.0) 28.7 (7.0)
Any Physical Activity (past 30 days) 8,000 3,146 (80%) 3,094 (76%)
Age Category (years) 8,000

    18-24
235 (6.0%) 171 (4.2%)
    25-29
219 (5.6%) 189 (4.7%)
    30-34
253 (6.4%) 210 (5.2%)
    35-39
263 (6.7%) 302 (7.4%)
    40-44
290 (7.4%) 292 (7.2%)
    45-49
266 (6.8%) 252 (6.2%)
    50-54
305 (7.7%) 303 (7.5%)
    55-59
308 (7.8%) 352 (8.7%)
    60-64
408 (10%) 379 (9.3%)
    65-69
418 (11%) 483 (12%)
    70-74
382 (9.7%) 426 (10%)
    75-79
325 (8.3%) 338 (8.3%)
    80+
264 (6.7%) 367 (9.0%)
Income Category 8,000

    Less than $15,000
160 (4.1%) 247 (6.1%)
    $15,000-$24,999
271 (6.9%) 370 (9.1%)
    $25,000-$34,999
376 (9.6%) 495 (12%)
    $35,000-$49,999
482 (12%) 585 (14%)
    $50,000-$99,999
1,251 (32%) 1,260 (31%)
    $100,000-$199,999
996 (25%) 869 (21%)
    $200,000 or more
400 (10%) 238 (5.9%)
Education Level 8,000

    Less than high school
75 (1.9%) 49 (1.2%)
    High school diploma or GED
130 (3.3%) 122 (3.0%)
    Some college or technical school
950 (24%) 877 (22%)
    College graduate
1,018 (26%) 1,120 (28%)
    Graduate or professional degree
1,763 (45%) 1,896 (47%)
Smoking Status 8,000

    Current daily smoker
339 (8.6%) 319 (7.8%)
    Current some-day smoker
151 (3.8%) 117 (2.9%)
    Former smoker
1,207 (31%) 1,055 (26%)
    Never smoker
2,239 (57%) 2,573 (63%)
1 Mean (SD); n (%)

Step 8: Reporting Final Analytic Sample

nrow(brfss_analytic)
## [1] 8000

The final analytic sample is 8000 observations with 9 variables after cleaning the dataset, recoding missing variables, and dropping any missing values.

Part 1: Multiple Linear Regression

Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?

Step 1: Multiple Linear Regression Model

# Full MLR Model: Unadjusted
m_full <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)

# Summary
summary(m_full)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000-$24,999                     -1.63703    0.46899  -3.491 0.000485
## income$25,000-$34,999                     -2.07637    0.44797  -4.635 3.63e-06
## income$35,000-$49,999                     -2.54685    0.43819  -5.812 6.40e-09
## income$50,000-$99,999                     -3.05048    0.41069  -7.428 1.22e-13
## income$100,000-$199,999                   -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000-$24,999                     ***
## income$25,000-$34,999                     ***
## income$35,000-$49,999                     ***
## income$50,000-$99,999                     ***
## income$100,000-$199,999                   ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

Step 2: Fitted Regression Equation

Coefficients Table

tidy(m_full, conf.int = TRUE) %>%
  mutate(
   across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption   = "Table 1. Multiple Linear Regression: Full Model (BRFSS 2023, n = 8,000)",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    format = "html"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(c(2, 3), background = "#EBF5FB")
Table 1. Multiple Linear Regression: Full Model (BRFSS 2023, n = 8,000)
Term Estimate Std. Error t-statistic p-value 95% CI Lower 95% CI Upper
(Intercept) 9.6505 0.9541 10.1151 0.0000 7.7803 11.5208
physhlth_days 0.2656 0.0101 26.3841 0.0000 0.2459 0.2853
bmi 0.0334 0.0132 2.5274 0.0115 0.0075 0.0593
sexFemale 1.3904 0.1675 8.3007 0.0000 1.0620 1.7187
exerciseYes -0.6512 0.2147 -3.0326 0.0024 -1.0721 -0.2303
age_group25-29 -1.0566 0.5194 -2.0343 0.0420 -2.0747 -0.0385
age_group30-34 -1.0939 0.5065 -2.1600 0.0308 -2.0867 -0.1012
age_group35-39 -1.8110 0.4885 -3.7072 0.0002 -2.7686 -0.8534
age_group40-44 -2.8931 0.4875 -5.9346 0.0000 -3.8487 -1.9375
age_group45-49 -3.0562 0.4977 -6.1408 0.0000 -4.0318 -2.0806
age_group50-54 -3.5167 0.4832 -7.2775 0.0000 -4.4640 -2.5695
age_group55-59 -4.4960 0.4755 -9.4543 0.0000 -5.4282 -3.5638
age_group60-64 -4.5221 0.4585 -9.8633 0.0000 -5.4209 -3.6234
age_group65-69 -5.5779 0.4502 -12.3903 0.0000 -6.4604 -4.6955
age_group70-74 -6.0254 0.4574 -13.1728 0.0000 -6.9220 -5.1287
age_group75-79 -6.2866 0.4748 -13.2392 0.0000 -7.2174 -5.3557
age_group80+ -6.8197 0.4768 -14.3019 0.0000 -7.7544 -5.8850
income$15,000-$24,999 -1.6370 0.4690 -3.4905 0.0005 -2.5564 -0.7177
income$25,000-$34,999 -2.0764 0.4480 -4.6351 0.0000 -2.9545 -1.1982
income$35,000-$49,999 -2.5469 0.4382 -5.8122 0.0000 -3.4058 -1.6879
income$50,000-$99,999 -3.0505 0.4107 -7.4277 0.0000 -3.8555 -2.2454
income$100,000-$199,999 -3.4998 0.4292 -8.1537 0.0000 -4.3413 -2.6584
income$200,000 or more -3.7841 0.5004 -7.5628 0.0000 -4.7649 -2.8033
educationHigh school diploma or GED 0.0799 0.8107 0.0986 0.9215 -1.5092 1.6690
educationSome college or technical school 1.0768 0.6897 1.5612 0.1185 -0.2753 2.4288
educationCollege graduate 1.7909 0.6912 2.5911 0.0096 0.4360 3.1458
educationGraduate or professional degree 1.7378 0.6925 2.5095 0.0121 0.3803 3.0953
smokingCurrent some-day smoker -1.5867 0.5352 -2.9645 0.0030 -2.6359 -0.5375
smokingFormer smoker -1.9897 0.3371 -5.9020 0.0000 -2.6506 -1.3289
smokingNever smoker -2.9368 0.3216 -9.1313 0.0000 -3.5673 -2.3063

menthlth_days = 9.6505 + (0.266 * physhlth_days) + (0.033 * bmi) + (1.390 * sexFemale) - (0.651 * exerciseYes) - (1.057 * age_group25-29) - (1.094 * age_group30-34) - (1.811 * age_group35-39) - (2.893 * age_group40-44) - (3.056 * age_group45-49) - (3.517 * age_group50-54) - (4.496 * age_group55-59) + - (4.522 * age_group60-64) - (5.578 * age_group65_69) - (6.025 * age_group70-74) - (6.287 * age_group75-79) - (6.820 * age_group80+) - (1.637 * income_15,000−24,999) - (2.076 * income_25,000-34,999) - (2.547 * income_35,000-49,999) - (3.051 * income_50,000-99,999) - (3.500 * income_100,000-199,999) - (3.784 * income$200,000 or more) + (0.080 * educationHigh school diploma or GED) + (1.077 * educationSome college or technical school) + (1.791 * educationCollege graduate) + (1.738 * educationGraduate or professional degree) - (1.587 * smokingCurrent some-day smoker) - (1.990 * smokingFormer smoker) - (2.937 * smokingNever smoker)

Step 3: Coefficient Interpretations

physhlth_days (continuous predictor): For each one unit increase in physically unhealthy day, the mentally unhealthy days increases by 0.266 units, holding all other predictors constant. This is the strongest continuous predictor and suggests that worse physical health is linked to worse mental health.

bmi (continuous predictor): For each one unit increase in BMI is associated with roughly 0.033 more mentally unhealthy days, holding all other predictors constant. Generally, this means that higher body weight is modestly linked to poorer mental health.

sex: Female vs. Male (reference): Females report about 1.39 more mentally unhealthy days than males. This indicates females experiences slightly more poor mental health days than males, holding all other factors constant.

exercise: Yes vs. No (reference): People who exercise report 0.651 fewer mentally unhealthy days than those who do not. This shows exercise is associated with modestly better mental health, holding all other factors constant.

Age group, 40-45: Adults aged 40-45 report 2.893 fewer mentally unhealthy days in the past 30 days compared to young adults (18–24). This shows that those who are middle-aged adults appear to have better mental health compared to younger adults, holding all other factors constant.

Income, $50,000-$99,999: People earning $50,000–$99,000 report about 3.051 fewer mentally unhealthy days than those earning less than $15,000. Higher income is linked to better mental health, holding all other factors constant.

Income, $200,00: Those earning $200,000 or more report 3.784 fewer mentally unhealthy days compared to the lowest income group. This means that very high income is associated with the fewest mentally unhealthy days, holding all other factors constant.

Step 4: Model-Level Statistics - R², Adjusted R², & F-Test

# Extract model-level statistics
mod_stats <- glance(m_full)

# Create a tidy table
tibble(
  Statistic = c(
    "R-squared",
    "Adjusted R-squared",
    "Root MSE (Residual Standard Error)",
    "Overall F-statistic",
    "Numerator df (p)",
    "Denominator df (n - p - 1)",
    "Overall p-value"
  ),
  Value = c(
    round(mod_stats$r.squared, 4),
    round(mod_stats$adj.r.squared, 4),
    round(mod_stats$sigma, 3),
    round(mod_stats$statistic, 2),
    mod_stats$df,           # numerator df
    mod_stats$df.residual,  # denominator df
    format.pval(mod_stats$p.value, digits = 3)
  )
) |>
  kable(caption = "Table 1.1. Model-Level Summary Statistics — BRFSS 2023") |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 1.1. Model-Level Summary Statistics — BRFSS 2023
Statistic Value
R-squared 0.1853
Adjusted R-squared 0.1824
Root MSE (Residual Standard Error) 7.352
Overall F-statistic 62.52
Numerator df (p) 29
Denominator df (n - p - 1) 7970
Overall p-value <2e-16

R² = 0.1853: About 18.5% of the variation in mentally unhealthy days is explained by the predictors physhlth_days, bmi, sex, exercise, age_group, income, education, smoking in the model.

Adjusted R² = 0.1824: The adjusted R² is slightly lower than the unadjusted R², meaning that the number of predictors in the model may contain some variables that do not substantially improve its explanatory power. Because adjusted R² penalizes the addition of unnecessary predictors, this suggests that while the model explains about 18.24% of the variance in the outcome, some predictors may be contributing little and the model could potentially be simplified without losing much explanatory value.

Root MSE = 7.352: This indicates that the model’s predicted number of mentally unhealthy days typically differs from the actual reported value by about 7.35 days. In practical terms, this suggests that while the model captures some patterns in the data, there is still a considerable amount of prediction error for individual respondents. This level of error indicates moderate predictive accuracy and implies that other factors not included in the model may also influence mentally unhealthy days.

F-Test Results: H₀ (Null Hypothesis): All regression coefficients = 0. The F-statistic = 62.52 and overall p-value = <2e-16, we are going to reject H₀, indicating that the model is statisticallt significant and at least one predictor contributes significantly to explaining variation in mentally unhealthy days.

Part 2: Tests of Hypotheses

Step 1: Type III (Partial) Sums of Squares

anova_3 <- Anova(m_full, type = "III")

anova_3 |>
  tidy() |>
  mutate(
    across(where(is.numeric), \(x) round(x, 4)),
    `Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
  ) |>
  kable(
    caption = "Table 2: Type III Partial Sum of Squares - Testing All Predictor's Contribution",
    col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  column_spec(6, bold = TRUE)
Table 2: Type III Partial Sum of Squares - Testing All Predictor’s Contribution
Predictor Sum of Sq df F value p-value Significant?
(Intercept) 5529.7722 1 102.3152 0.0000 Yes
physhlth_days 37622.7952 1 696.1198 0.0000 Yes
bmi 345.2408 1 6.3879 0.0115 Yes
sex 3723.8662 1 68.9012 0.0000 Yes
exercise 497.0434 1 9.1966 0.0024 Yes
age_group 30092.1774 12 46.3986 0.0000 Yes
income 4733.8943 6 14.5982 0.0000 Yes
education 1265.1504 4 5.8521 0.0001 Yes
smoking 5101.1076 3 31.4613 0.0000 Yes
Residuals 430750.0872 7970 NA NA NA

Based on the Type III (Partial) Sums of Squares, all of the predictors have statistically significant partial associations at α = 0.05. Physically unhealthy days (physhlth_days) is the strongest predictor — as it has the largest sum of squares and the largest F-statistic. This means that it explains the most variation in mentally unhealthy days after controlling for everything else.

Step 2: Chunk Test Wihtout Income

# Reduced model (remove income)
m_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking, data = brfss_analytic)

# Compare models
anova_no_income <- anova(m_no_income, m_full) |>
  as.data.frame() |>
  rownames_to_column("Model") |>
  mutate(
    Model = c("Reduced (physhlth_days + bmi + sex + exercise + age_group + education + smoking)", "Full Model"),
    across(where(is.numeric), ~ round(., 4))
  ) |>
  kable(
    caption = "Table 2.1. Chunk Test: Comparison Between Reduced Model Without Income and Full Model",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

anova_no_income
Table 2.1. Chunk Test: Comparison Between Reduced Model Without Income and Full Model
Model Res. df RSS df Sum of Sq F p-value
Reduced (physhlth_days + bmi + sex + exercise + age_group + education + smoking) 7976 435484.0 NA NA NA NA
Full Model 7970 430750.1 6 4733.894 14.5982 0

Null Hypothesis (H₀): All income coefficients = 0, and income, as a group, is not associated with mentally unhealthy days and does not improve the model.

Alternative Hypothesis (H₁): At least one income coefficient ≠ 0, and income, as a group, is associated with mentally unhealthy days and improves the model.

Test Results: F-statistic: F(6, 7970) = 14.5982 Degrees of freedom: numerator = 6, denominator = 7970 p-value: 0

Conclusion: Since the p-value = 0 which is less than 0.05, we reject the null hypothesis. Income, as a group, significantly improves the model and is associated with mentally unhealthy days after adjusting for all other predictors.

Step 3: Chunk Test Without Education

# Reduced model (remove education)
m_no_edu <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking, data = brfss_analytic)

# Compare models
anova_no_edu <- anova(m_no_edu, m_full) |>
  as.data.frame() |>
  rownames_to_column("Model") |>
  mutate(
    Model = c("Reduced (physhlth_days + bmi + sex + exercise + age_group + income + smoking)", "Full Model"),
    across(where(is.numeric), ~ round(., 4))
  ) |>
  kable(
    caption = "Table 2.2. Chunk Test: Comparison Between Reduced Model Without Education and Full Model",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

anova_no_edu
Table 2.2. Chunk Test: Comparison Between Reduced Model Without Education and Full Model
Model Res. df RSS df Sum of Sq F p-value
Reduced (physhlth_days + bmi + sex + exercise + age_group + income + smoking) 7974 432015.2 NA NA NA NA
Full Model 7970 430750.1 4 1265.15 5.8521 1e-04

Null Hypothesis (H₀): All education coefficients = 0, and education, as a group, is not associated with mentally unhealthy days and does not improve the model,

Alternative Hypothesis (H₁): At least one education coefficient ≠ 0, and education, as a group, is associated with mentally unhealthy days and improves the model.

Test Results: F-statistic: F(4, 7970) = 5.8521 Degrees of freedom: numerator = 6, denominator = 7970 p-value: 0.0001

Conclusion: Since the p-value = 0 which is less than 0.05, we reject the null hypothesis. Education, as a group, significantly improves the model and is associated with mentally unhealthy days after adjusting for all other predictors.

Step 4: Interpretation

The Type III results indicate that several predictors including physically unhealthy days, income, education, and smoking, make significant independent contributions to explaining the variation in mentally unhealthy days after adjusting for all other variables in the model. Among these, physically unhealthy days seems to be the strongest contributor, due to having the highest sum of squares and the highest F-statistic. The chunk tests further show that both income and education, as groups, significantly improve model fit. This highlights that the overall contribution of a categorical variable can be important even when individual levels are not. Compared to individual t-tests, chunk tests evaluate the joint effect of all categories within a variable, providing a more comprehensive assessment of its overall importance in the model.

Part 3: Interaction Analysis

Step 1: Creating Binary Variable “smoker_current”

brfss_analytic <- brfss_analytic %>%
  mutate(
    smoker_current = case_when(
      smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
      smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
      TRUE ~ NA_character_
    ),
    smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker"))
  )

brfss_analytic %>%
  select(smoker_current) %>%
  tbl_summary(
    label = list(smoker_current ~ "Current Smoking Status"),
    statistic = all_categorical() ~ "{n} ({p}%)",
    missing = "no"
  ) %>%
  add_n() %>%
  bold_labels() %>%
  italicize_levels() %>%
  modify_caption("Table 3: Current Smoking Status")
Table 3: Current Smoking Status
Characteristic N N = 8,0001
Current Smoking Status 8,000
    Non-smoker
7,074 (88%)
    Current smoker
926 (12%)
1 n (%)

Step 2: Fiting Two Models

# Model A: 
ModelA <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education, data = brfss_analytic)

tidy(ModelA, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 3.1. Model A — Main Effects Only",
    col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 3.1. Model A — Main Effects Only
Term Estimate SE t p-value 95% CI Lower 95% CI Upper
(Intercept) 6.7553 0.9271 7.2869 0.0000 4.9381 8.5726
physhlth_days 0.2686 0.0101 26.6731 0.0000 0.2489 0.2884
bmi 0.0334 0.0132 2.5250 0.0116 0.0075 0.0593
sexFemale 1.3331 0.1673 7.9675 0.0000 1.0051 1.6611
smoker_currentCurrent smoker 2.1287 0.2712 7.8489 0.0000 1.5971 2.6604
exerciseYes -0.6725 0.2150 -3.1274 0.0018 -1.0940 -0.2510
age_group25-29 -0.9149 0.5198 -1.7602 0.0784 -1.9338 0.1040
age_group30-34 -0.8823 0.5061 -1.7434 0.0813 -1.8743 0.1097
age_group35-39 -1.5810 0.4877 -3.2421 0.0012 -2.5369 -0.6251
age_group40-44 -2.6157 0.4858 -5.3847 0.0000 -3.5680 -1.6635
age_group45-49 -2.8246 0.4970 -5.6836 0.0000 -3.7988 -1.8504
age_group50-54 -3.2600 0.4821 -6.7628 0.0000 -4.2050 -2.3151
age_group55-59 -4.2301 0.4741 -8.9219 0.0000 -5.1595 -3.3007
age_group60-64 -4.2484 0.4568 -9.3000 0.0000 -5.1439 -3.3529
age_group65-69 -5.2338 0.4467 -11.7163 0.0000 -6.1095 -4.3582
age_group70-74 -5.7023 0.4545 -12.5457 0.0000 -6.5933 -4.8113
age_group75-79 -5.8977 0.4703 -12.5401 0.0000 -6.8197 -4.9758
age_group80+ -6.4888 0.4737 -13.6975 0.0000 -7.4174 -5.5602
income$15,000-$24,999 -1.6797 0.4698 -3.5754 0.0004 -2.6007 -0.7588
income$25,000-$34,999 -2.1023 0.4486 -4.6861 0.0000 -2.9817 -1.2229
income$35,000-$49,999 -2.5869 0.4390 -5.8931 0.0000 -3.4474 -1.7264
income$50,000-$99,999 -3.0823 0.4114 -7.4921 0.0000 -3.8887 -2.2758
income$100,000-$199,999 -3.5360 0.4300 -8.2232 0.0000 -4.3789 -2.6931
income$200,000 or more -3.8625 0.5011 -7.7078 0.0000 -4.8448 -2.8802
educationHigh school diploma or GED 0.2139 0.8119 0.2634 0.7922 -1.3776 1.8053
educationSome college or technical school 1.1965 0.6907 1.7323 0.0833 -0.1575 2.5505
educationCollege graduate 1.9035 0.6922 2.7499 0.0060 0.5466 3.2604
educationGraduate or professional degree 1.7456 0.6938 2.5160 0.0119 0.3856 3.1057
# Model B: 
ModelB <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education, data = brfss_analytic)

tidy(ModelB, conf.int = TRUE) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 3.2. Model B, With Interaction (sex * smoker_current)",
    col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 3.2. Model B, With Interaction (sex * smoker_current)
Term Estimate SE t p-value 95% CI Lower 95% CI Upper
(Intercept) 6.8994 0.9286 7.4301 0.0000 5.0791 8.7196
physhlth_days 0.2686 0.0101 26.6788 0.0000 0.2489 0.2883
bmi 0.0331 0.0132 2.5017 0.0124 0.0072 0.0590
sexFemale 1.1855 0.1775 6.6784 0.0000 0.8376 1.5335
smoker_currentCurrent smoker 1.5208 0.3654 4.1621 0.0000 0.8045 2.2371
exerciseYes -0.6728 0.2150 -3.1301 0.0018 -1.0942 -0.2515
age_group25-29 -0.9202 0.5196 -1.7709 0.0766 -1.9388 0.0984
age_group30-34 -0.8924 0.5059 -1.7640 0.0778 -1.8841 0.0993
age_group35-39 -1.5929 0.4875 -3.2673 0.0011 -2.5485 -0.6372
age_group40-44 -2.6286 0.4856 -5.4127 0.0000 -3.5806 -1.6766
age_group45-49 -2.8425 0.4969 -5.7209 0.0000 -3.8165 -1.8686
age_group50-54 -3.2778 0.4820 -6.8012 0.0000 -4.2226 -2.3331
age_group55-59 -4.2499 0.4740 -8.9652 0.0000 -5.1791 -3.3206
age_group60-64 -4.2640 0.4567 -9.3364 0.0000 -5.1593 -3.3688
age_group65-69 -5.2506 0.4466 -11.7563 0.0000 -6.1261 -4.3751
age_group70-74 -5.7111 0.4544 -12.5686 0.0000 -6.6018 -4.8203
age_group75-79 -5.9076 0.4702 -12.5646 0.0000 -6.8292 -4.9859
age_group80+ -6.4995 0.4736 -13.7239 0.0000 -7.4278 -5.5711
income$15,000-$24,999 -1.6357 0.4700 -3.4804 0.0005 -2.5570 -0.7144
income$25,000-$34,999 -2.0746 0.4486 -4.6243 0.0000 -2.9540 -1.1952
income$35,000-$49,999 -2.5455 0.4392 -5.7964 0.0000 -3.4064 -1.6847
income$50,000-$99,999 -3.0430 0.4116 -7.3935 0.0000 -3.8498 -2.2362
income$100,000-$199,999 -3.5097 0.4300 -8.1623 0.0000 -4.3526 -2.6668
income$200,000 or more -3.8405 0.5010 -7.6651 0.0000 -4.8226 -2.8583
educationHigh school diploma or GED 0.1256 0.8124 0.1546 0.8772 -1.4669 1.7180
educationSome college or technical school 1.1179 0.6912 1.6172 0.1059 -0.2371 2.4729
educationCollege graduate 1.8179 0.6928 2.6239 0.0087 0.4598 3.1760
educationGraduate or professional degree 1.6691 0.6943 2.4040 0.0162 0.3081 3.0300
sexFemale:smoker_currentCurrent smoker 1.2833 0.5171 2.4819 0.0131 0.2697 2.2968

Step 3: Testing Statistical Significance

anova_interaction <- anova(ModelA, ModelB)
anova_interaction
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7972 432509                              
## 2   7971 432175  1    333.97 6.1598 0.01309 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_interaction |>
  tidy() |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption   = "Table 3.3. Comparison Between Model A (no interaction) vs. Model B (with interaction)",
    col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 3.3. Comparison Between Model A (no interaction) vs. Model B (with interaction)
Model Res. df RSS df Sum of Sq F p-value
menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education 7972 432508.9 NA NA NA NA
menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education 7971 432174.9 1 333.9749 6.1598 0.0131

Null Hypothesis (H₀): The interaction between sex and current smoking does not improve the model; β for the interaction = 0.

Alternative Hypothesis (H₁): The interaction between sex and current smoking significantly improves the model; β ≠ 0.

Conclusion at α = 0.05: F(1, 7971) = 6.16, p = 0.013 From these results and the p-value < 0.05, we are going to reject the null hypothesis. The interaction between sex and current smoking status is statistically significant; this means the effect of smoking on mentally unhealthy days differs by sex.

Step 4: Interpret the Interaction Using the Coefficients from Model B

# Get coefficients from Model B
Coef_B <- coef(ModelB)

# Effect of current smoking among men (reference group)
smoking_men <- Coef_B["smoker_currentCurrent smoker"]

# Effect of current smoking among women (add interaction term)
smoking_women <- smoking_men + Coef_B["sexFemale:smoker_currentCurrent smoker"]

# Difference between women and men
diff_sex <- smoking_women - smoking_men

# Print results
cat("Estimated association of current smoking with mentally unhealthy days:\n")
## Estimated association of current smoking with mentally unhealthy days:
cat("  Among men   :", round(smoking_men, 3), "days\n")
##   Among men   : 1.521 days
cat("  Among women :", round(smoking_women, 3), "days\n")
##   Among women : 2.804 days
cat("  Difference  :", round(diff_sex, 3), "days\n")
##   Difference  : 1.283 days

Interpretation: Current smoking is associated with 1.52 more mentally unhealthy days per month for men (reference group). Among women, smoking is associated with 2.80 more days, which is about 1.28 days higher than for men, indicating a stronger association in women.

Step 5: Visualize Interaction

pred_int <- ggpredict(ModelB, terms = c("smoker_current", "sex"))

ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group, group = group)) +
  geom_point(size = 4, position = position_dodge(width = 0.3)) +
  geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, position = position_dodge(width = 0.3)) +
  labs(
    title = "Predicted Mentally Unhealthy Days by Sex and Current Smoking Status",
    subtitle = "Model B Interaction: sex * smoker_current",
    x = "Smoking Status",
    y = "Predicted Mentally Unhealthy Days (past 30 days)",
    color = "Sex",
    fill = "Sex"
  ) +
  theme_minimal(base_size = 13) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")

Step 6: Interpretation of Interaction

The relationship between smoking and mental health differs for men and women. Current smokers tend to report more mentally unhealthy days than non-smokers, but this effect is stronger among women. This means that women who smoke may experience a greater burden of poor mental health compared to men who smoke. Public health interventions targeting smoking cessation could have additional mental health benefits, especially for women. Addressing smoking among women may therefore help reduce disparities in mental well-being.

Part 4: Reflection

The results of this analysis suggest both health-related factors and socioeconomic factors play an important role in shaping mental health among US adults. Physically unhealthy days is the strongest predictor of mentally unhealthy days, and explains the contribution of variation in mentally unhealthy days. This highlights the close connection between physical and mental well-being. Age groups also contributes significantly, followed by smoking status and income, indicating that both behavioral and social factors are important. While all predictors are statistically significant in the Type III analysis, some variables such as BMI and exercise had relatively smaller contributions compared to others.

A key limitation of cross-sectional survey data is that exposure and outcome are measured at the same time, making it impossible to determine the direction of relationships or establish causality. For example, it is unclear whether poor physical health leads to worse mental health or vice versa. Additionally, cross-sectional data are vulnerable to recall bias and self-reporting errors, which may affect the accuracy of variables, such as mental health days or health behaviors. Unmeasured confounding is also a concern. For example, access to healthcare could influence both health behaviors (like exercise or smoking) and mental health outcomes, and pre-existing mental health conditions could affect both current health status and reported mentally unhealthy days.

Adjusted R-squared provides a more accurate measure of model performance than regular R-squared because it accounts for the number of predictors included in the model. While R-squared will always increase as more variables are added, adjusted R-squared checks whether the new variables actually improve the model enough to justify making it more complicated. This helps ensure that improvements in model fit reflect meaningful contributions rather than over-fitting. Chunk tests further strengthen interpretation by evaluating the overall contribution of grouped variables such as income and education. Rather than relying solely on individual t-tests for each category, chunk tests assess whether the variable as a whole significantly improves the model. This is particularly useful when some individual categories may not appear statistically significant.

I used AI assistance to help visualize the interaction between sex and smoking status using ggeffects::ggpredict(). When I first created the plot, the lines were not appearing correctly due to issues with grouping and positioning in the code. AI helped identify and fix these issues by adjusting the aesthetics and ensuring the correct grouping structure. I verified the final output by confirming that the plot matched the expected interaction pattern based on my model results.