library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)

Part 0: Data Acquisition and Preparation

brfss_raw <- read_xpt("/Users/karoekor/Downloads/data files/LLCP2023.XPT ")

brfss_sel <- brfss_raw |>
  select(
    MENTHLTH,
    PHYSHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`
  )

brfss_clean <- brfss_sel |>
  mutate(
    # Outcome and continuous
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    ),
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    ),
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),
    # Binary
    sex = factor(
      case_when(SEXVAR == 1 ~ "Male", SEXVAR == 2 ~ "Female", TRUE ~ NA_character_),
      levels = c("Male", "Female")
    ),
    exercise = factor(
      case_when(EXERANY2 == 1 ~ "Yes", EXERANY2 == 2 ~ "No",
                EXERANY2 %in% c(7, 9) ~ NA_character_, TRUE ~ NA_character_),
      levels = c("No", "Yes")
    ),
    # Age group (13 levels)
    age_group = factor(
      case_when(
        `_AGEG5YR` == 1  ~ "18-24",
        `_AGEG5YR` == 2  ~ "25-29",
        `_AGEG5YR` == 3  ~ "30-34",
        `_AGEG5YR` == 4  ~ "35-39",
        `_AGEG5YR` == 5  ~ "40-44",
        `_AGEG5YR` == 6  ~ "45-49",
        `_AGEG5YR` == 7  ~ "50-54",
        `_AGEG5YR` == 8  ~ "55-59",
        `_AGEG5YR` == 9  ~ "60-64",
        `_AGEG5YR` == 10 ~ "65-69",
        `_AGEG5YR` == 11 ~ "70-74",
        `_AGEG5YR` == 12 ~ "75-79",
        `_AGEG5YR` == 13 ~ "80+",
        `_AGEG5YR` == 14 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("18-24","25-29","30-34","35-39","40-44","45-49",
                 "50-54","55-59","60-64","65-69","70-74","75-79","80+")
    ),
    # Income (7 levels)
    income = factor(
      case_when(
        `_INCOMG1` == 1 ~ "Less than $15,000",
        `_INCOMG1` == 2 ~ "$15,000–$24,999",
        `_INCOMG1` == 3 ~ "$25,000–$34,999",
        `_INCOMG1` == 4 ~ "$35,000–$49,999",
        `_INCOMG1` == 5 ~ "$50,000–$99,999",
        `_INCOMG1` == 6 ~ "$100,000–$199,999",
        `_INCOMG1` == 7 ~ "$200,000 or more",
        `_INCOMG1` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than $15,000","$15,000–$24,999","$25,000–$34,999",
                 "$35,000–$49,999","$50,000–$99,999","$100,000–$199,999","$200,000 or more")
    ),
    # Education (5 levels)
    education = factor(
      case_when(
        EDUCA %in% c(1, 2) ~ "Less than high school",
        EDUCA == 3 ~ "High school diploma or GED",
        EDUCA == 4 ~ "Some college or technical school",
        EDUCA == 5 ~ "College graduate",
        EDUCA == 6 ~ "Graduate or professional degree",
        EDUCA == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than high school","High school diploma or GED",
                 "Some college or technical school","College graduate",
                 "Graduate or professional degree")
    ),
    # Smoking (4 levels)
    smoking = factor(
      case_when(
        `_SMOKER3` == 1 ~ "Current daily smoker",
        `_SMOKER3` == 2 ~ "Current some-day smoker",
        `_SMOKER3` == 3 ~ "Former smoker",
        `_SMOKER3` == 4 ~ "Never smoker",
        `_SMOKER3` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Never smoker","Former smoker","Current some-day smoker","Current daily smoker")
    )
  )

miss_report <- brfss_clean |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking) |>
  summarise(across(everything(), list(
    n_missing = ~sum(is.na(.)),
    pct_missing = ~round(mean(is.na(.)) * 100, 1)
  ))) |>
  pivot_longer(everything(), names_to = c("variable", ".value"), names_sep = "_(?=[^_]+$)")

miss_report |>
  kable(caption = "Missing values before exclusions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Missing values before exclusions
variable	missing
menthlth_days_n	8108.0
menthlth_days_pct	1.9
physhlth_days_n	10785.0
physhlth_days_pct	2.5
bmi_n	40535.0
bmi_pct	9.4
sex_n	0.0
sex_pct	0.0
exercise_n	1251.0
exercise_pct	0.3
age_group_n	7779.0
age_group_pct	1.8
income_n	86623.0
income_pct	20.0
education_n	2325.0
education_pct	0.5
smoking_n	23062.0
smoking_pct	5.3

set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
          age_group, income, education, smoking) |>
  slice_sample(n = 8000)

cat("Final analytic n:", nrow(brfss_analytic), "\n")

## Final analytic n: 8000

brfss_analytic |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
         age_group, income, education, smoking) |>
  tbl_summary(
    by = sex,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    label = list(
      menthlth_days ~ "Mentally unhealthy days",
      physhlth_days ~ "Physically unhealthy days",
      bmi ~ "BMI",
      exercise ~ "Any exercise (past 30 days)",
      age_group ~ "Age group",
      income ~ "Annual household income",
      education ~ "Education level",
      smoking ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  bold_labels()

Characteristic	Overall N = 8,000¹	Male N = 3,936¹	Female N = 4,064¹
Mentally unhealthy days	4 (8)	4 (8)	5 (9)
Physically unhealthy days	4 (9)	4 (8)	5 (9)
BMI	29 (7)	29 (6)	29 (7)
Any exercise (past 30 days)	6,240 (78%)	3,146 (80%)	3,094 (76%)
Age group
18-24	406 (5.1%)	235 (6.0%)	171 (4.2%)
25-29	408 (5.1%)	219 (5.6%)	189 (4.7%)
30-34	463 (5.8%)	253 (6.4%)	210 (5.2%)
35-39	565 (7.1%)	263 (6.7%)	302 (7.4%)
40-44	582 (7.3%)	290 (7.4%)	292 (7.2%)
45-49	518 (6.5%)	266 (6.8%)	252 (6.2%)
50-54	608 (7.6%)	305 (7.7%)	303 (7.5%)
55-59	660 (8.3%)	308 (7.8%)	352 (8.7%)
60-64	787 (9.8%)	408 (10%)	379 (9.3%)
65-69	901 (11%)	418 (11%)	483 (12%)
70-74	808 (10%)	382 (9.7%)	426 (10%)
75-79	663 (8.3%)	325 (8.3%)	338 (8.3%)
80+	631 (7.9%)	264 (6.7%)	367 (9.0%)
Annual household income
Less than $15,000	407 (5.1%)	160 (4.1%)	247 (6.1%)
$15,000–$24,999	641 (8.0%)	271 (6.9%)	370 (9.1%)
$25,000–$34,999	871 (11%)	376 (9.6%)	495 (12%)
$35,000–$49,999	1,067 (13%)	482 (12%)	585 (14%)
$50,000–$99,999	2,511 (31%)	1,251 (32%)	1,260 (31%)
$100,000–$199,999	1,865 (23%)	996 (25%)	869 (21%)
$200,000 or more	638 (8.0%)	400 (10%)	238 (5.9%)
Education level
Less than high school	124 (1.6%)	75 (1.9%)	49 (1.2%)
High school diploma or GED	252 (3.2%)	130 (3.3%)	122 (3.0%)
Some college or technical school	1,827 (23%)	950 (24%)	877 (22%)
College graduate	2,138 (27%)	1,018 (26%)	1,120 (28%)
Graduate or professional degree	3,659 (46%)	1,763 (45%)	1,896 (47%)
Smoking status
Never smoker	4,812 (60%)	2,239 (57%)	2,573 (63%)
Former smoker	2,262 (28%)	1,207 (31%)	1,055 (26%)
Current some-day smoker	268 (3.4%)	151 (3.8%)	117 (2.9%)
Current daily smoker	658 (8.2%)	339 (8.6%)	319 (7.8%)
¹ Mean (SD); n (%)

Part 1: Multiple Linear Regression

model_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

summary(model_full)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.71372    0.92549   7.254 4.42e-13
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000–$24,999                     -1.63703    0.46899  -3.491 0.000485
## income$25,000–$34,999                     -2.07637    0.44797  -4.635 3.63e-06
## income$35,000–$49,999                     -2.54685    0.43819  -5.812 6.40e-09
## income$50,000–$99,999                     -3.05048    0.41069  -7.428 1.22e-13
## income$100,000–$199,999                   -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingFormer smoker                       0.94710    0.19329   4.900 9.77e-07
## smokingCurrent some-day smoker             1.35011    0.46820   2.884 0.003942
## smokingCurrent daily smoker                2.93681    0.32162   9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000–$24,999                     ***
## income$25,000–$34,999                     ***
## income$35,000–$49,999                     ***
## income$50,000–$99,999                     ***
## income$100,000–$199,999                   ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingFormer smoker                      ***
## smokingCurrent some-day smoker            ** 
## smokingCurrent daily smoker               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

round(coef(model_full), 3)

##                               (Intercept) 
##                                     6.714 
##                             physhlth_days 
##                                     0.266 
##                                       bmi 
##                                     0.033 
##                                 sexFemale 
##                                     1.390 
##                               exerciseYes 
##                                    -0.651 
##                            age_group25-29 
##                                    -1.057 
##                            age_group30-34 
##                                    -1.094 
##                            age_group35-39 
##                                    -1.811 
##                            age_group40-44 
##                                    -2.893 
##                            age_group45-49 
##                                    -3.056 
##                            age_group50-54 
##                                    -3.517 
##                            age_group55-59 
##                                    -4.496 
##                            age_group60-64 
##                                    -4.522 
##                            age_group65-69 
##                                    -5.578 
##                            age_group70-74 
##                                    -6.025 
##                            age_group75-79 
##                                    -6.287 
##                              age_group80+ 
##                                    -6.820 
##                     income$15,000–$24,999 
##                                    -1.637 
##                     income$25,000–$34,999 
##                                    -2.076 
##                     income$35,000–$49,999 
##                                    -2.547 
##                     income$50,000–$99,999 
##                                    -3.050 
##                   income$100,000–$199,999 
##                                    -3.500 
##                    income$200,000 or more 
##                                    -3.784 
##       educationHigh school diploma or GED 
##                                     0.080 
## educationSome college or technical school 
##                                     1.077 
##                 educationCollege graduate 
##                                     1.791 
##  educationGraduate or professional degree 
##                                     1.738 
##                      smokingFormer smoker 
##                                     0.947 
##            smokingCurrent some-day smoker 
##                                     1.350 
##               smokingCurrent daily smoker 
##                                     2.937

glance(model_full) |>
  select(r.squared, adj.r.squared, sigma, statistic, df, df.residual, p.value) |>
  kable(digits = 4, caption = "Model-level fit statistics") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Model-level fit statistics
r.squared	adj.r.squared	sigma	statistic	df	df.residual	p.value
0.1853	0.1824	7.3516	62.5234	29	7970	0

Step 2: Fitted Regression Equation

\[\hat{Y} = 6.714 + 0.266(\text{physhlth\_days}) + 0.033(\text{bmi}) + 1.390(\text{sexFemale}) - 0.651(\text{exerciseYes}) - 1.057(\text{age}_{25-29}) - 1.094(\text{age}_{30-34}) - 1.811(\text{age}_{35-39}) - 2.893(\text{age}_{40-44}) - 3.056(\text{age}_{45-49}) - 3.517(\text{age}_{50-54}) - 4.496(\text{age}_{55-59}) - 4.522(\text{age}_{60-64}) - 5.578(\text{age}_{65-69}) - 6.025(\text{age}_{70-74}) - 6.287(\text{age}_{75-79}) - 6.820(\text{age}_{80+}) - 1.637(\text{income}_{\$15-24k}) - 2.076(\text{income}_{\$25-34k}) - 2.547(\text{income}_{\$35-49k}) - 3.050(\text{income}_{\$50-99k}) - 3.500(\text{income}_{\$100-199k}) - 3.784(\text{income}_{\$200k+}) + 0.080(\text{educ\_HS}) + 1.077(\text{educ\_SomeCollege}) + 1.791(\text{educ\_CollegeGrad}) + 1.738(\text{educ\_GradDeg}) + 0.947(\text{smoking\_Former}) + 1.350(\text{smoking\_SomeDay}) + 2.937(\text{smoking\_Daily})\]

Step 3: Coefficient Interpretations

physhlth_days (β = 0.266): For each additional physically unhealthy day in the past 30, the number of mentally unhealthy days is estimated to be 0.266 days higher, holding all other variables constant.

bmi (β = 0.033): For each one-unit increase in BMI, the number of mentally unhealthy days is estimated to be 0.033 days higher, holding all other variables constant.

sexFemale (β = 1.390): Females are estimated to report 1.390 more mentally unhealthy days in the past 30 compared to males (the reference category), holding all other variables constant.

exerciseYes (β = −0.651): Those who exercised in the past 30 days are estimated to report 0.651 fewer mentally unhealthy days compared to those who did not exercise (the reference category), holding all other variables constant.

age_group60–64 (β = −4.496): Adults aged 60–64 are estimated to report 4.496 fewer mentally unhealthy days compared to those aged 18–24 (the reference category), holding all other variables constant.

income$50,000–$99,999 (β = −3.050): Adults with annual household incomes of $50,000–$99,999 are estimated to report 3.050 fewer mentally unhealthy days compared to those earning less than $15,000 (the reference category), holding all other variables constant.

income$200,000 or more (β = −3.784): Adults with annual household incomes of $200,000 or more are estimated to report 3.784 fewer mentally unhealthy days compared to those earning less than $15,000 (the reference category), holding all other variables constant.

Step 4: Model-Level Statistics

R-squared = 0.1853: The model explains 18.53% of the variance in mentally unhealthy days across the 8,000 respondents.

Adjusted R-squared = 0.1824: After penalizing for the 29 predictors in the model, 18.24% of variance is explained. The small difference from R² (0.0029) indicates the predictors collectively contribute meaningful explanatory power and the model is not substantially overfitted.

Root MSE (Residual Standard Error) = 7.352: On average, the model’s predicted values of mentally unhealthy days deviate from the observed values by approximately 7.35 days.

Overall F-test: The null hypothesis (H₀) states that all regression coefficients are simultaneously equal to zero — that is, none of the predictors is associated with mentally unhealthy days. The F-statistic is 62.52 on 29 (numerator) and 7970 (denominator) degrees of freedom, with p < 2.2e-16. We reject H₀ and conclude that the model as a whole explains a statistically significant amount of variance in mentally unhealthy days (p < 0.001).

Part 2: Tests of Hypotheses

# Type III partial F-tests for all predictors
car::Anova(model_full, type = "III")

## Anova Table (Type III tests)
## 
## Response: menthlth_days
##               Sum Sq   Df  F value    Pr(>F)    
## (Intercept)     2844    1  52.6240 4.418e-13 ***
## physhlth_days  37623    1 696.1198 < 2.2e-16 ***
## bmi              345    1   6.3879 0.0115095 *  
## sex             3724    1  68.9012 < 2.2e-16 ***
## exercise         497    1   9.1966 0.0024325 ** 
## age_group      30092   12  46.3986 < 2.2e-16 ***
## income          4734    6  14.5982 < 2.2e-16 ***
## education       1265    4   5.8521 0.0001064 ***
## smoking         5101    3  31.4613 < 2.2e-16 ***
## Residuals     430750 7970                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Reduced model: all predictors EXCEPT income
model_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)

# Chunk test: does income improve model fit?
anova(model_no_income, model_full)

## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7976 435484                                  
## 2   7970 430750  6    4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Reduced model: all predictors EXCEPT education
model_no_educ <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

# Chunk test: does education improve model fit?
anova(model_no_educ, model_full)

## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7974 432015                                  
## 2   7970 430750  4    1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 1: Type III Partial F-Tests — Interpretation

The Type III ANOVA table shows partial F-tests for each predictor after accounting for all others. At α = 0.05, all predictors are statistically significant: physhlth_days (F = 696.12, p < 2.2e-16), bmi (F = 6.39, p = 0.0115), sex (F = 68.90, p < 2.2e-16), exercise (F = 9.20, p = 0.0024), age_group (F = 46.40, p < 2.2e-16), income (F = 14.60, p < 2.2e-16), education (F = 5.85, p = 0.0001), and smoking (F = 31.46, p < 2.2e-16).

Step 2: Chunk Test for Income — Inference

H₀: The six income dummy variable coefficients are simultaneously equal to zero — income as a group does not improve model fit beyond the other predictors.
Hₐ: At least one income coefficient differs from zero — income collectively improves model fit.
F = 14.598, df = 6, p < 2.2e-16. We reject H₀ and conclude that income, as a group of predictors, is statistically significantly associated with mentally unhealthy days after adjusting for all other variables in the model (p < 0.001).

Step 3: Chunk Test for Education — Inference

H₀: The four education dummy variable coefficients are simultaneously equal to zero — education as a group does not improve model fit beyond the other predictors.
Hₐ: At least one education coefficient differs from zero.
F = 5.8521, df = 4, p = 0.0001064. We reject H₀ and conclude that education level, as a group of predictors, is statistically significantly associated with mentally unhealthy days after adjusting for all other variables in the model (p < 0.001).

Step 4: Written Synthesis

The Type III results indicate that every predictor in the model has a statistically significant independent association with mentally unhealthy days after adjusting for all other variables. Physical health burden (physhlth_days), sex, age group, and smoking status show the strongest partial associations, as reflected by their large F-values, while bmi and education make smaller but still significant contributions. The chunk tests for income and education add an important layer of inference beyond the individual t-tests from summary(): rather than evaluating each dummy variable level in isolation, they test whether a multi-level categorical variable as a whole improves model fit. This is especially valuable because a given dummy level may appear non-significant individually while the variable collectively contributes meaningful explanatory power — as confirmed here, where both income and education pass the chunk test decisively despite some non-significant individual levels.

Part 3: Interaction Analysis

brfss_analytic <- brfss_analytic |>
  mutate(
    smoker_current = factor(
      case_when(
        smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
        smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
        TRUE ~ NA_character_
      ),
      levels = c("Non-smoker", "Current smoker")  # Non-smoker is reference
    )
  )

table(brfss_analytic$smoker_current, useNA = "ifany")

## 
##     Non-smoker Current smoker 
##           7074            926

# Model A: main effects only
model_A <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_A)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2913  -3.8732  -1.6219   0.6681  30.4937 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.75533    0.92706   7.287 3.48e-13
## physhlth_days                              0.26862    0.01007  26.673  < 2e-16
## bmi                                        0.03341    0.01323   2.525 0.011589
## sexFemale                                  1.33309    0.16732   7.967 1.84e-15
## smoker_currentCurrent smoker               2.12874    0.27121   7.849 4.74e-15
## exerciseYes                               -0.67248    0.21503  -3.127 0.001770
## age_group25-29                            -0.91492    0.51978  -1.760 0.078412
## age_group30-34                            -0.88226    0.50606  -1.743 0.081301
## age_group35-39                            -1.58102    0.48765  -3.242 0.001191
## age_group40-44                            -2.61572    0.48577  -5.385 7.46e-08
## age_group45-49                            -2.82462    0.49698  -5.684 1.37e-08
## age_group50-54                            -3.26004    0.48206  -6.763 1.45e-11
## age_group55-59                            -4.23010    0.47413  -8.922  < 2e-16
## age_group60-64                            -4.24840    0.45682  -9.300  < 2e-16
## age_group65-69                            -5.23383    0.44671 -11.716  < 2e-16
## age_group70-74                            -5.70233    0.45453 -12.546  < 2e-16
## age_group75-79                            -5.89774    0.47031 -12.540  < 2e-16
## age_group80+                              -6.48879    0.47372 -13.697  < 2e-16
## income$15,000–$24,999                     -1.67974    0.46981  -3.575 0.000352
## income$25,000–$34,999                     -2.10230    0.44863  -4.686 2.83e-06
## income$35,000–$49,999                     -2.58691    0.43898  -5.893 3.95e-09
## income$50,000–$99,999                     -3.08227    0.41140  -7.492 7.50e-14
## income$100,000–$199,999                   -3.53598    0.43000  -8.223 2.30e-16
## income$200,000 or more                    -3.86251    0.50112  -7.708 1.43e-14
## educationHigh school diploma or GED        0.21386    0.81186   0.263 0.792237
## educationSome college or technical school  1.19653    0.69073   1.732 0.083264
## educationCollege graduate                  1.90349    0.69219   2.750 0.005974
## educationGraduate or professional degree   1.74564    0.69382   2.516 0.011890
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000–$24,999                     ***
## income$25,000–$34,999                     ***
## income$35,000–$49,999                     ***
## income$50,000–$99,999                     ***
## income$100,000–$199,999                   ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school .  
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.366 on 7972 degrees of freedom
## Multiple R-squared:  0.182,  Adjusted R-squared:  0.1792 
## F-statistic:  65.7 on 27 and 7972 DF,  p-value: < 2.2e-16

# Model B: with sex * smoker_current interaction
model_B <- lm(
  menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_B)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.337  -3.837  -1.604   0.628  30.426 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.89938    0.92857   7.430 1.20e-13
## physhlth_days                              0.26859    0.01007  26.679  < 2e-16
## bmi                                        0.03309    0.01323   2.502 0.012380
## sexFemale                                  1.18553    0.17752   6.678 2.58e-11
## smoker_currentCurrent smoker               1.52079    0.36539   4.162 3.19e-05
## exerciseYes                               -0.67285    0.21496  -3.130 0.001753
## age_group25-29                            -0.92018    0.51962  -1.771 0.076616
## age_group30-34                            -0.89242    0.50591  -1.764 0.077774
## age_group35-39                            -1.59287    0.48752  -3.267 0.001090
## age_group40-44                            -2.62864    0.48564  -5.413 6.39e-08
## age_group45-49                            -2.84255    0.49687  -5.721 1.10e-08
## age_group50-54                            -3.27784    0.48195  -6.801 1.11e-11
## age_group55-59                            -4.24987    0.47404  -8.965  < 2e-16
## age_group60-64                            -4.26404    0.45671  -9.336  < 2e-16
## age_group65-69                            -5.25062    0.44662 -11.756  < 2e-16
## age_group70-74                            -5.71106    0.45439 -12.569  < 2e-16
## age_group75-79                            -5.90758    0.47018 -12.565  < 2e-16
## age_group80+                              -6.49946    0.47359 -13.724  < 2e-16
## income$15,000–$24,999                     -1.63574    0.46999  -3.480 0.000503
## income$25,000–$34,999                     -2.07457    0.44862  -4.624 3.82e-06
## income$35,000–$49,999                     -2.54551    0.43915  -5.796 7.03e-09
## income$50,000–$99,999                     -3.04298    0.41157  -7.394 1.57e-13
## income$100,000–$199,999                   -3.50972    0.42999  -8.162 3.79e-16
## income$200,000 or more                    -3.84047    0.50103  -7.665 2.00e-14
## educationHigh school diploma or GED        0.12556    0.81238   0.155 0.877177
## educationSome college or technical school  1.11789    0.69123   1.617 0.105865
## educationCollege graduate                  1.81788    0.69283   2.624 0.008711
## educationGraduate or professional degree   1.66905    0.69428   2.404 0.016240
## sexFemale:smoker_currentCurrent smoker     1.28327    0.51705   2.482 0.013089
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000–$24,999                     ***
## income$25,000–$34,999                     ***
## income$35,000–$49,999                     ***
## income$50,000–$99,999                     ***
## income$100,000–$199,999                   ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## sexFemale:smoker_currentCurrent smoker    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared:  0.1826, Adjusted R-squared:  0.1798 
## F-statistic: 63.61 on 28 and 7971 DF,  p-value: < 2.2e-16

# Partial F-test: does the interaction term improve model fit?
anova(model_A, model_B)

## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7972 432509                              
## 2   7971 432175  1    333.97 6.1598 0.01309 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Extract key coefficients for interpretation
coefs_B <- coef(model_B)

# Effect of current smoking among Males (reference sex)
beta_smoker <- coefs_B["smoker_currentCurrent smoker"]

# Interaction coefficient
beta_interaction <- coefs_B["sexFemale:smoker_currentCurrent smoker"]

# Effect among Females = beta_smoker + beta_interaction
effect_female <- beta_smoker + beta_interaction

cat("Effect of current smoking among Males:", round(beta_smoker, 3), "days\n")

## Effect of current smoking among Males: 1.521 days

cat("Interaction coefficient (Female × Current smoker):", round(beta_interaction, 3), "\n")

## Interaction coefficient (Female × Current smoker): 1.283

cat("Effect of current smoking among Females:", round(effect_female, 3), "days\n")

## Effect of current smoking among Females: 2.804 days

# Visualize interaction with ggeffects
pred <- ggpredict(model_B, terms = c("smoker_current", "sex"))

plot(pred) +
  labs(
    title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
    x = "Smoking Status",
    y = "Predicted Mentally Unhealthy Days (past 30)",
    color = "Sex"
  ) +
  theme_bw()

Step 3: Interaction Test — Inference

H₀: The association between current smoking and mentally unhealthy days does not differ by sex (the interaction coefficient equals zero).
Hₐ: The association between current smoking and mentally unhealthy days differs between males and females.
F = 6.1598, df = 1, p = 0.01309. We reject H₀ at α = 0.05 and conclude that sex statistically significantly modifies the association between current smoking and mentally unhealthy days (p = 0.013).

Step 4: Interaction Coefficient Interpretation

Effect among males (β = 1.521): Among males, current smokers are estimated to report 1.521 more mentally unhealthy days in the past 30 compared to non-smokers, holding all other variables constant.

Interaction coefficient (β = 1.283): The association between current smoking and mentally unhealthy days is estimated to be 1.283 days larger among females than among males, holding all other variables constant.

Effect among females (1.521 + 1.283 = 2.804): Among females, current smokers are estimated to report 2.804 more mentally unhealthy days compared to non-smokers, holding all other variables constant.

The estimated association between current smoking and poor mental health is nearly twice as large among females (2.804 days) as among males (1.521 days), a difference of 1.283 days. This indicates that sex modifies the relationship between smoking and mental health, with smoking being more strongly associated with psychological distress among women than men in this sample.

Step 6: Public Health Interpretation

Among U.S. adults, current smokers tend to report more days of poor mental health than non-smokers, but this pattern differs meaningfully between men and women. Women who currently smoke appear to experience a substantially greater burden of poor mental health days compared to female non-smokers, while the same difference for men, though still present, is considerably smaller. These findings suggest that tobacco use and psychological distress may be more closely intertwined in women’s health than in men’s, potentially reflecting differences in stress-related smoking motivations or the psychological effects of nicotine across sexes. Public health programs that simultaneously address smoking cessation and mental health support — particularly among women — may be especially effective in reducing the overall burden of mental health distress in the population.

Part 4: Reflection

A. Findings and Limitations

The results suggest that mental health among U.S. adults is shaped by a complex interplay of physical, behavioral, and socioeconomic factors. Physical health burden was the strongest predictor: each additional physically unhealthy day was associated with approximately 0.27 more mentally unhealthy days, a finding consistent with the well-documented bidirectional relationship between physical and mental health. Smoking status also showed strong associations, with current daily smokers reporting nearly 3 additional mentally unhealthy days compared to never smokers, after adjusting for all other predictors. Age was inversely associated with mentally unhealthy days — older adults consistently reported fewer such days compared to the youngest group (18–24), which may reflect cohort differences in distress reporting, coping capacity, or health expectations. Income and education were significant predictors as well, with higher-income respondents reporting substantially fewer mentally unhealthy days than those earning less than $15,000.

A key limitation of this analysis is its cross-sectional design: because exposure and outcome are measured at the same time point, it is not possible to establish the direction of associations. For example, it is unclear whether poor physical health leads to worse mental health or vice versa. At least two important confounders may bias the associations estimated here. First, chronic disease burden is not adjusted for in this model, individuals with conditions such as diabetes or heart disease likely have elevated rates of both physical and mental health problems, which could confound the observed associations between predictors like BMI or exercise and mentally unhealthy days. Second, social support and living situation are unmeasured factors that influence mental health and are also correlated with income, exercise behavior, and smoking status, potentially biasing several of the estimated coefficients.

B. From Simple to Multiple Regression

Adjusted R² differs from regular R² in that it penalizes for the number of predictors in the model, decreasing when added predictors do not improve fit proportionally. In this model with 29 terms, the small difference between R² (0.1853) and Adjusted R² (0.1824) suggests that the predictors collectively contribute meaningful explanatory power rather than simply inflating fit artificially. This distinction is especially important when adding multi-level categorical predictors such as age group (12 dummies) and income (6 dummies), because each additional dummy consumes a degree of freedom and R² will increase mechanically even for uninformative predictors. The chunk tests for income and education complement the individual t-tests from summary() by evaluating each categorical variable as a whole: a given dummy level may appear non-significant individually while the variable as a group still contributes meaningfully to the model. This is exactly what the chunk test is designed to detect, and both tests confirmed statistically significant collective contributions from income and education beyond what individual-level t-tests alone would reveal.

C. AI Transparency

All R code for this assignment was written and executed independently. Examples from class lecture files and additional online resources were used to troubleshoot file path errors when loading the BRFSS XPT file in RStudio Desktop on macOS, including identifying issues such as a trailing space in the filename that prevented the file from loading. These same resources were also used to guide interpretation of results for Parts 1–3, including coefficient interpretations, hypothesis test conclusions, interaction effects, and the public health summary. Output accuracy was verified by cross-checking R console results with written interpretations prior to submission.

EPI 553 HW3 – Multiple Linear Regression, Tests of Hypotheses, and Interaction Analysis

Karo Ekor

2026-04-05