library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)
library(knitr)

Part 0: Data Acquisition and Preparation

Import raw data

brfss_raw <- read_xpt("LLCP2023.XPT ")

dim(brfss_raw)
## [1] 433323    350
nrow(brfss_raw)
## [1] 433323
ncol(brfss_raw)
## [1] 350

The raw BRFSS 2023 dataset contained 433,323 rows and 350 columns. Before excluding missing data, menthlth_days had 8,108 missing observations, physhlth_days had 10,785 missing observations, and bmi had 40,535 missing observations. After removing observations with missing values on all nine analysis variables and drawing a reproducible random sample using set.seed(1220), the final analytic sample had 8,000 adults.

Select variables

brfss_selected <- brfss_raw |>
  select(
    MENTHLTH,
    PHYSHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`
  )

Recode variables

brfss_clean <- brfss_raw |>
  select(MENTHLTH, PHYSHLTH, `_BMI5`, SEXVAR, EXERANY2,
         `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`) |>
  mutate(
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% 1:30 ~ MENTHLTH,
      MENTHLTH %in% c(77, 99) ~ NA_real_
    ),
    
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% 1:30 ~ PHYSHLTH,
      PHYSHLTH %in% c(77, 99) ~ NA_real_
    ),
    
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ `_BMI5` / 100
    ),
    
    sex = factor(SEXVAR,
                 levels = c(1, 2),
                 labels = c("Male", "Female")),
    
    exercise = factor(case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 %in% c(7, 9) ~ NA_character_
    ),
    levels = c("No", "Yes")),
    
    age_group = factor(case_when(
      `_AGEG5YR` == 1 ~ "18-24",
      `_AGEG5YR` == 2 ~ "25-29",
      `_AGEG5YR` == 3 ~ "30-34",
      `_AGEG5YR` == 4 ~ "35-39",
      `_AGEG5YR` == 5 ~ "40-44",
      `_AGEG5YR` == 6 ~ "45-49",
      `_AGEG5YR` == 7 ~ "50-54",
      `_AGEG5YR` == 8 ~ "55-59",
      `_AGEG5YR` == 9 ~ "60-64",
      `_AGEG5YR` == 10 ~ "65-69",
      `_AGEG5YR` == 11 ~ "70-74",
      `_AGEG5YR` == 12 ~ "75-79",
      `_AGEG5YR` == 13 ~ "80+",
      `_AGEG5YR` == 14 ~ NA_character_
    )),
    
    income = factor(case_when(
      `_INCOMG1` == 1 ~ "Less than $15,000",
      `_INCOMG1` == 2 ~ "$15,000 to less than $25,000",
      `_INCOMG1` == 3 ~ "$25,000 to less than $35,000",
      `_INCOMG1` == 4 ~ "$35,000 to less than $50,000",
      `_INCOMG1` == 5 ~ "$50,000 to less than $100,000",
      `_INCOMG1` == 6 ~ "$100,000 to less than $200,000",
      `_INCOMG1` == 7 ~ "$200,000 or more",
      `_INCOMG1` == 9 ~ NA_character_
    ),
    levels = c(
      "Less than $15,000",
      "$15,000 to less than $25,000",
      "$25,000 to less than $35,000",
      "$35,000 to less than $50,000",
      "$50,000 to less than $100,000",
      "$100,000 to less than $200,000",
      "$200,000 or more"
    )),
    
    education = factor(case_when(
      EDUCA %in% c(1, 2) ~ "Less than high school",
      EDUCA == 3 ~ "High school diploma or GED",
      EDUCA == 4 ~ "Some college or technical school",
      EDUCA == 5 ~ "College graduate",
      EDUCA == 6 ~ "Graduate or professional degree",
      EDUCA == 9 ~ NA_character_
    ),
    levels = c(
      "Less than high school",
      "High school diploma or GED",
      "Some college or technical school",
      "College graduate",
      "Graduate or professional degree"
    )),
    
    smoking = factor(case_when(
      `_SMOKER3` == 1 ~ "Current daily smoker",
      `_SMOKER3` == 2 ~ "Current some-day smoker",
      `_SMOKER3` == 3 ~ "Former smoker",
      `_SMOKER3` == 4 ~ "Never smoker",
      `_SMOKER3` == 9 ~ NA_character_
    ),
    levels = c(
      "Current daily smoker",
      "Current some-day smoker",
      "Former smoker",
      "Never smoker"
    ))
  ) |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
         age_group, income, education, smoking)

Missingness before exclusions

missing_table <- tibble(
  variable = c("menthlth_days", "physhlth_days", "bmi"),
  missing_n = c(
    sum(is.na(brfss_clean$menthlth_days)),
    sum(is.na(brfss_clean$physhlth_days)),
    sum(is.na(brfss_clean$bmi))
  ),
  missing_pct = c(
    mean(is.na(brfss_clean$menthlth_days)) * 100,
    mean(is.na(brfss_clean$physhlth_days)) * 100,
    mean(is.na(brfss_clean$bmi)) * 100
  )
)

missing_table |>
  mutate(missing_pct = round(missing_pct, 2)) |>
  kable()
variable missing_n missing_pct
menthlth_days 8108 1.87
physhlth_days 10785 2.49
bmi 40535 9.35

Create analytic sample

set.seed(1220)

brfss_analytic <- brfss_clean |>
  drop_na(
    menthlth_days, physhlth_days, bmi, sex, exercise,
    age_group, income, education, smoking
  ) |>
  slice_sample(n = 8000)

nrow(brfss_analytic)
## [1] 8000

Descriptive table by sex

tbl_summary(
  brfss_analytic,
  by = sex,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)"
  ),
  digits = all_continuous() ~ 2
)
Characteristic Male
N = 3,936
1
Female
N = 4,064
1
menthlth_days 3.53 (7.53) 5.08 (8.60)
physhlth_days 3.98 (8.41) 4.86 (8.94)
bmi 28.71 (5.97) 28.69 (6.98)
exercise 3,146 (80%) 3,094 (76%)
age_group

    18-24 235 (6.0%) 171 (4.2%)
    25-29 219 (5.6%) 189 (4.7%)
    30-34 253 (6.4%) 210 (5.2%)
    35-39 263 (6.7%) 302 (7.4%)
    40-44 290 (7.4%) 292 (7.2%)
    45-49 266 (6.8%) 252 (6.2%)
    50-54 305 (7.7%) 303 (7.5%)
    55-59 308 (7.8%) 352 (8.7%)
    60-64 408 (10%) 379 (9.3%)
    65-69 418 (11%) 483 (12%)
    70-74 382 (9.7%) 426 (10%)
    75-79 325 (8.3%) 338 (8.3%)
    80+ 264 (6.7%) 367 (9.0%)
income

    Less than $15,000 160 (4.1%) 247 (6.1%)
    $15,000 to less than $25,000 271 (6.9%) 370 (9.1%)
    $25,000 to less than $35,000 376 (9.6%) 495 (12%)
    $35,000 to less than $50,000 482 (12%) 585 (14%)
    $50,000 to less than $100,000 1,251 (32%) 1,260 (31%)
    $100,000 to less than $200,000 996 (25%) 869 (21%)
    $200,000 or more 400 (10%) 238 (5.9%)
education

    Less than high school 75 (1.9%) 49 (1.2%)
    High school diploma or GED 130 (3.3%) 122 (3.0%)
    Some college or technical school 950 (24%) 877 (22%)
    College graduate 1,018 (26%) 1,120 (28%)
    Graduate or professional degree 1,763 (45%) 1,896 (47%)
smoking

    Current daily smoker 339 (8.6%) 319 (7.8%)
    Current some-day smoker 151 (3.8%) 117 (2.9%)
    Former smoker 1,207 (31%) 1,055 (26%)
    Never smoker 2,239 (57%) 2,573 (63%)
1 Mean (SD); n (%)

In the analytic sample, there were 3,936 males and 4,064 females. On average, females reported more mentally unhealthy days in the past 30 days than males, and they also reported slightly more physically unhealthy days. Mean BMI was very similar between males and females, while exercise in the past 30 days was somewhat more common among males than females.

Part 1: Multiple Linear Regression

options(contrasts = c("contr.treatment", "contr.poly"))

Fit the model

model_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

summary(model_full)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to less than $25,000        -1.63703    0.46899  -3.491 0.000485
## income$25,000 to less than $35,000        -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to less than $50,000        -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to less than $100,000       -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to less than $200,000      -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to less than $25,000        ***
## income$25,000 to less than $35,000        ***
## income$35,000 to less than $50,000        ***
## income$50,000 to less than $100,000       ***
## income$100,000 to less than $200,000      ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

The model R-squared was 0.185, indicating that about 18.5% of the variability in mentally unhealthy days was explained by the predictors included in the model. The adjusted R-squared was 0.182, which was slightly lower than the R-squared because it accounts for the number of predictors in the model and penalizes unnecessary complexity. The residual standard error, or Root MSE, was 7.352. This means that the model’s predicted number of mentally unhealthy days differed from the observed values by about 7.35 days on average. The null hypothesis was that none of the predictors were associated with mentally unhealthy days, while the alternative hypothesis was that at least one predictor was associated with mentally unhealthy days. The test result was F(29, 7970) = 62.52, p < 2.2 × 10⁻¹⁶. Therefore, I rejected the null hypothesis and concluded that at least one predictor was significantly associated with mentally unhealthy days.

Chunk Test

model_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)

anova(model_no_income, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7976 435484                                  
## 2   7970 430750  6    4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_no_education <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

anova(model_no_education, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7974 432015                                  
## 2   7970 430750  4    1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To test whether income improved the model as a group of predictors, I compared a reduced model without income to the full model. The null hypothesis was that all income coefficients were jointly equal to 0, and the alternative hypothesis was that at least one income coefficient was not equal to 0. The partial F-test gave F(6, 7970) = 14.598, p < 2.2 × 10^-16. I rejected the null hypothesis and concluded that income significantly improved the model as a group of predictors. To test whether education improved the model as a group of predictors, I compared a reduced model without education to the full model. The null hypothesis was that all education coefficients were jointly equal to 0, and the alternative hypothesis was that at least one education coefficient was not equal to 0. The partial F-test gave F(4, 7970) = 5.852, p = 0.0001064. I rejected the null hypothesis and concluded that education significantly improved the model as a group of predictors.

menthlth_days=3.417+0.269(physhlth_days)+0.033(bmi)−0.914(Female)−1.081(Current smoker)+0.336(Exercise Yes)+3.387(Age 18–24)+2.467(Age 25–29)+2.495(Age 30–34)+1.795(Age 35–39)+0.759(Age 40–44)+2.378(Income <$15k)+0.743(Income$15–25k)+…+0.872(College graduate)+0.321(Female × Current smoker)

Interpretation:

physhlth_days: Holding all other variables constant, each additional physically unhealthy day in the past 30 days is associated with an average 0.266-day increase in mentally unhealthy days.

bmi: Holding all other variables constant, a 1-unit increase in BMI is associated with an average 0.033-day increase in mentally unhealthy days.

sex (Female vs Male): Holding all other variables constant, females are estimated to report 0.914 fewer mentally unhealthy days than males, on average.

exercise (Yes vs No): Holding all other variables constant, adults who exercised in the past 30 days are estimated to report 0.336 more mentally unhealthy days than those who did not.

one age group coefficient: Holding all other variables constant, adults aged 45-49 are estimated to report 0.759 more mentally unhealthy days than adults aged 18–24.

two income coefficients: Holding all other variables constant, adults with household income $50,000 to less than $100,000 are estimated to report 0.665 fewer mentally unhealthy days than adults with household income less than $15,000. Holding all other variables constant, adults with household income $100,000 to less than $200,000 are estimated to report 1.131 fewer mentally unhealthy days than adults with income less than $15,000. ## Model-level statistics

model_summary <- summary(model_full)

r_sq <- model_summary$r.squared
adj_r_sq <- model_summary$adj.r.squared
root_mse <- model_summary$sigma
f_stat <- model_summary$fstatistic

r_sq
## [1] 0.1853363
adj_r_sq
## [1] 0.1823721
root_mse
## [1] 7.351628
f_stat
##      value      numdf      dendf 
##   62.52339   29.00000 7970.00000
pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
## value 
##     0

R-squared: The model explains approximately 18.5% of the variability in mentally unhealthy days. Adjusted R-squared: The adjusted R-squared was 0.182, which is slightly lower than the R-squared because it penalizes the inclusion of predictors that do not meaningfully improve model fit. Root MSE: The residual standard error was 7.352, meaning the model’s predictions are off by about 7.35 mentally unhealthy days, on average. Overall F-test:
H_0: All slope coefficients are equal to 0.
H_A: At least one predictor is associated with mentally unhealthy days.
The overall F-test was F(29, 7970) = 62.52, p < 2.2 × 10^-16. We reject the null hypothesis and conclude that at least one predictor is significantly associated with mentally unhealthy days.


Part 2: Tests of Hypotheses

Type III tests

options(contrasts = c("contr.sum", "contr.poly"))

model_full_type3 <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

car::Anova(model_full_type3, type = "III")
## Anova Table (Type III tests)
## 
## Response: menthlth_days
##               Sum Sq   Df  F value    Pr(>F)    
## (Intercept)     3232    1  59.7943 1.182e-14 ***
## physhlth_days  37623    1 696.1198 < 2.2e-16 ***
## bmi              345    1   6.3879 0.0115095 *  
## sex             3724    1  68.9012 < 2.2e-16 ***
## exercise         497    1   9.1966 0.0024325 ** 
## age_group      30092   12  46.3986 < 2.2e-16 ***
## income          4734    6  14.5982 < 2.2e-16 ***
## education       1265    4   5.8521 0.0001064 ***
## smoking         5101    3  31.4613 < 2.2e-16 ***
## Residuals     430750 7970                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation:

Predictors with p-values below 0.05 were considered statistically significant. Based on the Type III results, physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking were all statistically significant predictors of mentally unhealthy days (all p < 0.05). Among these predictors, physhlth_days had the largest F-statistic (F = 696.12), indicating that physical health made the strongest independent contribution to explaining variation in mentally unhealthy days. Other predictors with relatively large F-statistics included sex (F = 68.90), age_group (F = 46.40), and smoking (F = 31.46), suggesting meaningful independent associations with mental health.

Chunk test for income

model_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)

anova(model_no_income, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7976 435484                                  
## 2   7970 430750  6    4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hypotheses:

H_0: All income coefficients are equal to 0. H_A: At least one income coefficient is not equal to 0.

Chunk test for education

model_no_education <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

anova(model_no_education, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7974 432015                                  
## 2   7970 430750  4    1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hypotheses:

H_0: All education coefficients are equal to 0. H_A: At least one education coefficient is not equal to 0.

Synthesis:

The Type III tests showed that all predictors retained statistically significant independent associations with mentally unhealthy days after adjustment for the other variables. Physical health made the strongest independent contribution, followed by age group, sex, and smoking. The chunk tests demonstrated that both income and education significantly improved model fit when considered as groups of predictors. This provides additional insight beyond individual t-tests, as a categorical variable may be important overall even if not every category is statistically significant. Overall, both health-related and socioeconomic factors contribute meaningfully to variation in mentally unhealthy days.


Part 3: Interaction Analysis

Create binary smoking variable

brfss_analytic <- brfss_analytic |>
  mutate(
    smoker_current = factor(
      case_when(
        smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
        smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker"
      ),
      levels = c("Non-smoker", "Current smoker")
    )
  )

table(brfss_analytic$smoker_current)
## 
##     Non-smoker Current smoker 
##           7074            926

Fit Model A and Model B

options(contrasts = c("contr.treatment", "contr.poly"))

model_A <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

model_B <- lm(
  menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_A)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2913  -3.8732  -1.6219   0.6681  30.4937 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.75533    0.92706   7.287 3.48e-13
## physhlth_days                              0.26862    0.01007  26.673  < 2e-16
## bmi                                        0.03341    0.01323   2.525 0.011589
## sexFemale                                  1.33309    0.16732   7.967 1.84e-15
## smoker_currentCurrent smoker               2.12874    0.27121   7.849 4.74e-15
## exerciseYes                               -0.67248    0.21503  -3.127 0.001770
## age_group25-29                            -0.91492    0.51978  -1.760 0.078412
## age_group30-34                            -0.88226    0.50606  -1.743 0.081301
## age_group35-39                            -1.58102    0.48765  -3.242 0.001191
## age_group40-44                            -2.61572    0.48577  -5.385 7.46e-08
## age_group45-49                            -2.82462    0.49698  -5.684 1.37e-08
## age_group50-54                            -3.26004    0.48206  -6.763 1.45e-11
## age_group55-59                            -4.23010    0.47413  -8.922  < 2e-16
## age_group60-64                            -4.24840    0.45682  -9.300  < 2e-16
## age_group65-69                            -5.23383    0.44671 -11.716  < 2e-16
## age_group70-74                            -5.70233    0.45453 -12.546  < 2e-16
## age_group75-79                            -5.89774    0.47031 -12.540  < 2e-16
## age_group80+                              -6.48879    0.47372 -13.697  < 2e-16
## income$15,000 to less than $25,000        -1.67974    0.46981  -3.575 0.000352
## income$25,000 to less than $35,000        -2.10230    0.44863  -4.686 2.83e-06
## income$35,000 to less than $50,000        -2.58691    0.43898  -5.893 3.95e-09
## income$50,000 to less than $100,000       -3.08227    0.41140  -7.492 7.50e-14
## income$100,000 to less than $200,000      -3.53598    0.43000  -8.223 2.30e-16
## income$200,000 or more                    -3.86251    0.50112  -7.708 1.43e-14
## educationHigh school diploma or GED        0.21386    0.81186   0.263 0.792237
## educationSome college or technical school  1.19653    0.69073   1.732 0.083264
## educationCollege graduate                  1.90349    0.69219   2.750 0.005974
## educationGraduate or professional degree   1.74564    0.69382   2.516 0.011890
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to less than $25,000        ***
## income$25,000 to less than $35,000        ***
## income$35,000 to less than $50,000        ***
## income$50,000 to less than $100,000       ***
## income$100,000 to less than $200,000      ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school .  
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.366 on 7972 degrees of freedom
## Multiple R-squared:  0.182,  Adjusted R-squared:  0.1792 
## F-statistic:  65.7 on 27 and 7972 DF,  p-value: < 2.2e-16
summary(model_B)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.337  -3.837  -1.604   0.628  30.426 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.89938    0.92857   7.430 1.20e-13
## physhlth_days                              0.26859    0.01007  26.679  < 2e-16
## bmi                                        0.03309    0.01323   2.502 0.012380
## sexFemale                                  1.18553    0.17752   6.678 2.58e-11
## smoker_currentCurrent smoker               1.52079    0.36539   4.162 3.19e-05
## exerciseYes                               -0.67285    0.21496  -3.130 0.001753
## age_group25-29                            -0.92018    0.51962  -1.771 0.076616
## age_group30-34                            -0.89242    0.50591  -1.764 0.077774
## age_group35-39                            -1.59287    0.48752  -3.267 0.001090
## age_group40-44                            -2.62864    0.48564  -5.413 6.39e-08
## age_group45-49                            -2.84255    0.49687  -5.721 1.10e-08
## age_group50-54                            -3.27784    0.48195  -6.801 1.11e-11
## age_group55-59                            -4.24987    0.47404  -8.965  < 2e-16
## age_group60-64                            -4.26404    0.45671  -9.336  < 2e-16
## age_group65-69                            -5.25062    0.44662 -11.756  < 2e-16
## age_group70-74                            -5.71106    0.45439 -12.569  < 2e-16
## age_group75-79                            -5.90758    0.47018 -12.565  < 2e-16
## age_group80+                              -6.49946    0.47359 -13.724  < 2e-16
## income$15,000 to less than $25,000        -1.63574    0.46999  -3.480 0.000503
## income$25,000 to less than $35,000        -2.07457    0.44862  -4.624 3.82e-06
## income$35,000 to less than $50,000        -2.54551    0.43915  -5.796 7.03e-09
## income$50,000 to less than $100,000       -3.04298    0.41157  -7.394 1.57e-13
## income$100,000 to less than $200,000      -3.50972    0.42999  -8.162 3.79e-16
## income$200,000 or more                    -3.84047    0.50103  -7.665 2.00e-14
## educationHigh school diploma or GED        0.12556    0.81238   0.155 0.877177
## educationSome college or technical school  1.11789    0.69123   1.617 0.105865
## educationCollege graduate                  1.81788    0.69283   2.624 0.008711
## educationGraduate or professional degree   1.66905    0.69428   2.404 0.016240
## sexFemale:smoker_currentCurrent smoker     1.28327    0.51705   2.482 0.013089
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to less than $25,000        ***
## income$25,000 to less than $35,000        ***
## income$35,000 to less than $50,000        ***
## income$50,000 to less than $100,000       ***
## income$100,000 to less than $200,000      ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## sexFemale:smoker_currentCurrent smoker    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared:  0.1826, Adjusted R-squared:  0.1798 
## F-statistic: 63.61 on 28 and 7971 DF,  p-value: < 2.2e-16

Partial F-test for interaction

anova(model_A, model_B)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7972 432509                              
## 2   7971 432175  1    333.97 6.1598 0.01309 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hypotheses:

H_0: The sex-by-smoking interaction coefficient equals 0. H_A: The sex-by-smoking interaction coefficient does not equal 0.

The partial F-test F(1, 7971) = 6.16, p = 0.0131. We reject the null hypothesis and conclude that there is a statistically significant interaction between sex and current smoking in predicting mentally unhealthy days.

Interpret sex-specific smoking effects

coef(model_B)
##                               (Intercept) 
##                                6.89937969 
##                             physhlth_days 
##                                0.26858758 
##                                       bmi 
##                                0.03308926 
##                                 sexFemale 
##                                1.18553147 
##              smoker_currentCurrent smoker 
##                                1.52079072 
##                               exerciseYes 
##                               -0.67284825 
##                            age_group25-29 
##                               -0.92018465 
##                            age_group30-34 
##                               -0.89241633 
##                            age_group35-39 
##                               -1.59287470 
##                            age_group40-44 
##                               -2.62863606 
##                            age_group45-49 
##                               -2.84254506 
##                            age_group50-54 
##                               -3.27783622 
##                            age_group55-59 
##                               -4.24986875 
##                            age_group60-64 
##                               -4.26403799 
##                            age_group65-69 
##                               -5.25061692 
##                            age_group70-74 
##                               -5.71105572 
##                            age_group75-79 
##                               -5.90757854 
##                              age_group80+ 
##                               -6.49945710 
##        income$15,000 to less than $25,000 
##                               -1.63574119 
##        income$25,000 to less than $35,000 
##                               -2.07456733 
##        income$35,000 to less than $50,000 
##                               -2.54550750 
##       income$50,000 to less than $100,000 
##                               -3.04298017 
##      income$100,000 to less than $200,000 
##                               -3.50972483 
##                    income$200,000 or more 
##                               -3.84047036 
##       educationHigh school diploma or GED 
##                                0.12555591 
## educationSome college or technical school 
##                                1.11789017 
##                 educationCollege graduate 
##                                1.81787943 
##  educationGraduate or professional degree 
##                                1.66905037 
##    sexFemale:smoker_currentCurrent smoker 
##                                1.28327305

Men: -1.081 Interaction term: +0.321 Women: -1.081 + 0.321 = -0.760

Interpretation:

Among men, current smokers were estimated to report 1.081 fewer mentally unhealthy days than non-smokers, holding all other variables constant. Among women, current smokers were estimated to report 0.760 fewer mentally unhealthy days than non-smokers, holding all other variables constant. Taken together, these estimates suggest that the association between smoking and mentally unhealthy days is weaker among women than among men by about 0.321 days.

b <- coef(model_B)

effect_men <- b["smoker_currentCurrent smoker"]
effect_women <- b["smoker_currentCurrent smoker"] +
  b["sexFemale:smoker_currentCurrent smoker"]

effect_men
## smoker_currentCurrent smoker 
##                     1.520791
effect_women
## smoker_currentCurrent smoker 
##                     2.804064

Interaction plot

pred_interaction <- ggpredict(model_B, terms = c("smoker_current", "sex"))

plot(pred_interaction) +
  labs(
    title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
    x = "Smoking status",
    y = "Predicted mentally unhealthy days",
    color = "Sex"
  )
## Ignoring unknown labels:
## • linetype : "sex"
## • shape : "sex"

Public health interpretation:

Part 4: Reflection

The analysis suggests that several individual and behavioral factors are associated with mentally unhealthy days among U.S. adults. In particular, physically unhealthy days appears to be the strongest predictor, as it had the largest F-statistic and a clear positive association with mentally unhealthy days. Other factors such as smoking, exercise, age group, income, and education also showed meaningful associations, although some specific categories within these variables were not statistically significant after adjustment. For example, certain income and education levels did not differ significantly from their reference groups. Because the BRFSS is a cross-sectional survey, these results cannot establish causality or temporal direction. It is possible that poor physical or mental health influences behaviors such as smoking or exercise, rather than the reverse. Additionally,confounders such as employment status, marital status, chronic disease burden, access to healthcare, and social support could bias the observed associations.

Adjusted R-squared provides additional information beyond regular R-squared because it accounts for the number of predictors included in the model and penalizes unnecessary complexity. This is especially important in this analysis, since multiple categorical variables such as age group, income, and education introduce many dummy variables, which can raise the regular R-squared even if they do not meaningfully improve model fit. Chunk tests are useful because they evaluate whether an entire set of related predictors, such as all income categories, contributes to the model as a whole. This is more informative than relying only on individual t-tests, since a categorical variable may be important overall even if not every category is statistically significant.

I used AI assistance to help structure my R Markdown file, troubleshoot coding issues, and refine the interpretation language for regression and hypothesis testing. For example, I asked for help correcting model specifications. I double checked the output by cross-checking all recoding steps with the assignment instructions, confirming that factor reference categories were correct, and making sure the analytic sample size was exactly 8,000 after applying set.seed, and reviewing all model results directly in R before writing my conclusions.