title: ” Multiple Linear Regression, Tests of Hypotheses, and Interaction Analysis” author: “Ummat Safwat Sristy” date: “4/5/2026” output: html_document: toc: true toc_float: true theme: flatly number_sections: true

Part 0: Data Acquisition and Preparation

Load Required Packages

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)

Import Raw Data

# Load the full XPT file 
brfss_raw <- read_xpt("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Labs\\LLCP2023.XPT")

# After first load, save as RDS for faster future loading:
saveRDS(brfss_raw, "brfss_hw3_raw.rds")

# In subsequent sessions, load with:
brfss_raw <- readRDS("brfss_hw3_raw.rds")

# Report dimensions
cat("Rows:", nrow(brfss_raw), "\n")
## Rows: 433323
cat("Columns:", ncol(brfss_raw), "\n")
## Columns: 350

Select Required Variables

brfss_selected <- brfss_raw |>
  select(
    MENTHLTH,
    PHYSHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`
  )

cat("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Labs\\LLCP2023.XPT")
## C:\Users\safwa\OneDrive - University at Albany - SUNY\EPI 553\Labs\LLCP2023.XPT

Recode Variables

brfss_clean <- brfss_selected |>
  mutate(
    # MENTHLTH: 88 = None (recode to 0), 77/99 = NA, 1-30 keep as-is
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    ),

    # PHYSHLTH: same rules as MENTHLTH
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    ),

    # BMI: divide by 100, 9999 = NA
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),

    # Sex: 1 = Male, 2 = Female
    sex = factor(
      case_when(
        SEXVAR == 1 ~ "Male",
        SEXVAR == 2 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    ),

    # Exercise: 1 = Yes, 2 = No, 7/9 = NA
    exercise = factor(
      case_when(
        EXERANY2 == 1 ~ "Yes",
        EXERANY2 == 2 ~ "No",
        EXERANY2 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Yes", "No")
    ),

    # Age group: 13 levels, 14 = NA
    age_group = factor(
      case_when(
        `_AGEG5YR` == 1  ~ "18-24",
        `_AGEG5YR` == 2  ~ "25-29",
        `_AGEG5YR` == 3  ~ "30-34",
        `_AGEG5YR` == 4  ~ "35-39",
        `_AGEG5YR` == 5  ~ "40-44",
        `_AGEG5YR` == 6  ~ "45-49",
        `_AGEG5YR` == 7  ~ "50-54",
        `_AGEG5YR` == 8  ~ "55-59",
        `_AGEG5YR` == 9  ~ "60-64",
        `_AGEG5YR` == 10 ~ "65-69",
        `_AGEG5YR` == 11 ~ "70-74",
        `_AGEG5YR` == 12 ~ "75-79",
        `_AGEG5YR` == 13 ~ "80+",
        `_AGEG5YR` == 14 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("18-24","25-29","30-34","35-39","40-44",
                 "45-49","50-54","55-59","60-64","65-69",
                 "70-74","75-79","80+")
    ),

    # Income: 7 levels, 9 = NA
    income = factor(
      case_when(
        `_INCOMG1` == 1 ~ "Less than $15,000",
        `_INCOMG1` == 2 ~ "$15,000 to $24,999",
        `_INCOMG1` == 3 ~ "$25,000 to $34,999",
        `_INCOMG1` == 4 ~ "$35,000 to $49,999",
        `_INCOMG1` == 5 ~ "$50,000 to $99,999",
        `_INCOMG1` == 6 ~ "$100,000 to $199,999",
        `_INCOMG1` == 7 ~ "$200,000 or more",
        `_INCOMG1` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than $15,000","$15,000 to $24,999",
                 "$25,000 to $34,999","$35,000 to $49,999",
                 "$50,000 to $99,999","$100,000 to $199,999",
                 "$200,000 or more")
    ),

    # Education: 6 levels (1-2 collapsed), 9 = NA
    education = factor(
      case_when(
        EDUCA %in% c(1, 2) ~ "Less than high school",
        EDUCA == 3 ~ "High school diploma or GED",
        EDUCA == 4 ~ "Some college or technical school",
        EDUCA == 5 ~ "College graduate",
        EDUCA == 6 ~ "Graduate or professional degree",
        EDUCA == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than high school",
                 "High school diploma or GED",
                 "Some college or technical school",
                 "College graduate",
                 "Graduate or professional degree")
    ),

    # Smoking: 4 levels, 9 = NA
    smoking = factor(
      case_when(
        `_SMOKER3` == 1 ~ "Current daily smoker",
        `_SMOKER3` == 2 ~ "Current some-day smoker",
        `_SMOKER3` == 3 ~ "Former smoker",
        `_SMOKER3` == 4 ~ "Never smoker",
        `_SMOKER3` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Current daily smoker","Current some-day smoker",
                 "Former smoker","Never smoker")
    )
  ) |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
         age_group, income, education, smoking)

Report Missingness

miss_summary <- brfss_clean |>
  summarise(
    across(everything(), ~ sum(is.na(.x))),
  ) |>
  pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") |>
  mutate(
    N_Total = nrow(brfss_clean),
    Pct_Missing = round(N_Missing / N_Total * 100, 2)
  )

miss_summary |>
  kable(caption = "Missing Values Before Exclusions") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Missing Values Before Exclusions
Variable N_Missing N_Total Pct_Missing
menthlth_days 8108 433323 1.87
physhlth_days 10785 433323 2.49
bmi 40535 433323 9.35
sex 0 433323 0.00
exercise 1251 433323 0.29
age_group 7779 433323 1.80
income 86623 433323 19.99
education 2325 433323 0.54
smoking 23062 433323 5.32

Create Analytic Dataset

set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
          age_group, income, education, smoking) |>
  slice_sample(n = 8000)

cat("Final analytic sample n =", nrow(brfss_analytic), "\n")
## Final analytic sample n = 8000

Descriptive Statistics Table

brfss_analytic |>
  tbl_summary(
    by = sex,
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 2,
    label = list(
      menthlth_days ~ "Mentally unhealthy days",
      physhlth_days ~ "Physically unhealthy days",
      bmi           ~ "BMI",
      exercise      ~ "Any exercise in past 30 days",
      age_group     ~ "Age group",
      income        ~ "Annual household income",
      education     ~ "Education",
      smoking       ~ "Smoking status"
    )
  ) |>
  add_overall() |>
  add_p() |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Statistics by Sex, BRFSS 2023 Analytic Sample (n = 8,000)**")
Table 1. Descriptive Statistics by Sex, BRFSS 2023 Analytic Sample (n = 8,000)
Characteristic Overall
N = 8,000
1
Male
N = 3,936
1
Female
N = 4,064
1
p-value2
Mentally unhealthy days 4.32 (8.13) 3.53 (7.53) 5.08 (8.60) <0.001
Physically unhealthy days 4.43 (8.69) 3.98 (8.41) 4.86 (8.94) <0.001
BMI 28.70 (6.50) 28.71 (5.97) 28.69 (6.98) 0.008
Any exercise in past 30 days 6,240 (78%) 3,146 (80%) 3,094 (76%) <0.001
Age group


<0.001
    18-24 406 (5.1%) 235 (6.0%) 171 (4.2%)
    25-29 408 (5.1%) 219 (5.6%) 189 (4.7%)
    30-34 463 (5.8%) 253 (6.4%) 210 (5.2%)
    35-39 565 (7.1%) 263 (6.7%) 302 (7.4%)
    40-44 582 (7.3%) 290 (7.4%) 292 (7.2%)
    45-49 518 (6.5%) 266 (6.8%) 252 (6.2%)
    50-54 608 (7.6%) 305 (7.7%) 303 (7.5%)
    55-59 660 (8.3%) 308 (7.8%) 352 (8.7%)
    60-64 787 (9.8%) 408 (10%) 379 (9.3%)
    65-69 901 (11%) 418 (11%) 483 (12%)
    70-74 808 (10%) 382 (9.7%) 426 (10%)
    75-79 663 (8.3%) 325 (8.3%) 338 (8.3%)
    80+ 631 (7.9%) 264 (6.7%) 367 (9.0%)
Annual household income


<0.001
    Less than $15,000 407 (5.1%) 160 (4.1%) 247 (6.1%)
    $15,000 to $24,999 641 (8.0%) 271 (6.9%) 370 (9.1%)
    $25,000 to $34,999 871 (11%) 376 (9.6%) 495 (12%)
    $35,000 to $49,999 1,067 (13%) 482 (12%) 585 (14%)
    $50,000 to $99,999 2,511 (31%) 1,251 (32%) 1,260 (31%)
    $100,000 to $199,999 1,865 (23%) 996 (25%) 869 (21%)
    $200,000 or more 638 (8.0%) 400 (10%) 238 (5.9%)
Education


0.003
    Less than high school 124 (1.6%) 75 (1.9%) 49 (1.2%)
    High school diploma or GED 252 (3.2%) 130 (3.3%) 122 (3.0%)
    Some college or technical school 1,827 (23%) 950 (24%) 877 (22%)
    College graduate 2,138 (27%) 1,018 (26%) 1,120 (28%)
    Graduate or professional degree 3,659 (46%) 1,763 (45%) 1,896 (47%)
Smoking status


<0.001
    Current daily smoker 658 (8.2%) 339 (8.6%) 319 (7.8%)
    Current some-day smoker 268 (3.4%) 151 (3.8%) 117 (2.9%)
    Former smoker 2,262 (28%) 1,207 (31%) 1,055 (26%)
    Never smoker 4,812 (60%) 2,239 (57%) 2,573 (63%)
1 Mean (SD); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

Part 1: Multiple Linear Regression

Step 1: Fit the Full Model

model_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

summary(model_full)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                8.99937    0.93968   9.577  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseNo                                 0.65116    0.21472   3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to $24,999                  -1.63703    0.46899  -3.491 0.000485
## income$25,000 to $34,999                  -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to $49,999                  -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to $99,999                  -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to $199,999                -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseNo                                ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to $24,999                  ***
## income$25,000 to $34,999                  ***
## income$35,000 to $49,999                  ***
## income$50,000 to $99,999                  ***
## income$100,000 to $199,999                ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

Step 2: Fitted Regression Equation

# Extract and round coefficients
coefs <- round(coef(model_full), 3)
coefs
##                               (Intercept) 
##                                     8.999 
##                             physhlth_days 
##                                     0.266 
##                                       bmi 
##                                     0.033 
##                                 sexFemale 
##                                     1.390 
##                                exerciseNo 
##                                     0.651 
##                            age_group25-29 
##                                    -1.057 
##                            age_group30-34 
##                                    -1.094 
##                            age_group35-39 
##                                    -1.811 
##                            age_group40-44 
##                                    -2.893 
##                            age_group45-49 
##                                    -3.056 
##                            age_group50-54 
##                                    -3.517 
##                            age_group55-59 
##                                    -4.496 
##                            age_group60-64 
##                                    -4.522 
##                            age_group65-69 
##                                    -5.578 
##                            age_group70-74 
##                                    -6.025 
##                            age_group75-79 
##                                    -6.287 
##                              age_group80+ 
##                                    -6.820 
##                  income$15,000 to $24,999 
##                                    -1.637 
##                  income$25,000 to $34,999 
##                                    -2.076 
##                  income$35,000 to $49,999 
##                                    -2.547 
##                  income$50,000 to $99,999 
##                                    -3.050 
##                income$100,000 to $199,999 
##                                    -3.500 
##                    income$200,000 or more 
##                                    -3.784 
##       educationHigh school diploma or GED 
##                                     0.080 
## educationSome college or technical school 
##                                     1.077 
##                 educationCollege graduate 
##                                     1.791 
##  educationGraduate or professional degree 
##                                     1.738 
##            smokingCurrent some-day smoker 
##                                    -1.587 
##                      smokingFormer smoker 
##                                    -1.990 
##                       smokingNever smoker 
##                                    -2.937

Step 3: Coefficient Interpretations

  • **physhlth_days (continuous): For each additional physically unhealthy day in the past 30 days, the number of mentally unhealthy days is estimated to increase by 0.266 days, holding all other variables constant.
  • bmi (continuous): For each one-unit increase in BMI, the number of mentally unhealthy days is estimated to increase by 0.033 days, holding all other variables constant
  • sexFemale vs. Male (reference): Females are estimated to report 1.390 more mentally unhealthy days in the past 30 days compared to males, holding all other variables constant.
  • **exerciseNo vs. Yes (reference):Those who did no exercise in the past 30 days are estimated to report 0.651 more mentally unhealthy days compared to those who did exercise, holding all other variables constant
  • **One age_group coefficient of your choice:Adults aged 65–69 are estimated to report 5.578 fewer mentally unhealthy days in the past 30 days compared to adults aged 18–24 (the reference group), holding all other variables constant
  • **Two income coefficients of your choice vs. “Less than $15,000” (reference): $15,000 to $24,999 vs. Less than $15,000 (reference): Adults in this income bracket are estimated to report 1.637 fewer mentally unhealthy days compared to those earning less than $15,000, holding all other variables constant. $100,000 to $199,999 vs. Less than $15,000 (reference): Adults in this income bracket are estimated to report 3.500 fewer mentally unhealthy days compared to those earning less than $15,000, holding all other variables constant. This suggests a fairly consistent income gradient, where higher income is associated with better mental health.

Step 4: Model-Level Statistics

glance(model_full) |>
  select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual) |>
  kable(digits = 4, caption = "Model-Level Statistics") |>
  kable_styling(bootstrap_options = c("striped", "hover"))
Model-Level Statistics
r.squared adj.r.squared sigma statistic p.value df df.residual
0.1853 0.1824 7.3516 62.5234 0 29 7970
  • **R-squared: 0.1853 The model’s predictors (physical health days, BMI, sex, exercise, age group, income, education, and smoking status) collectively explain 18.53% of the variance in mentally unhealthy days among U.S. adults in this sample. This indicates that while the predictors are statistically meaningful, the majority of variation in mental health outcomes is attributable to factors not captured in this model.

  • **Adjusted R-squared:0.1824. The Adjusted R-squared is 0.1824, slightly lower than the R-squared of 0.1853. Unlike R-squared, which increases automatically whenever a predictor is added regardless of whether it improves the model, Adjusted R-squared penalizes for the number of predictors included. The small difference (0.0029) between the two suggests that the predictors in this model are genuinely contributing explanatory power and that the model is not being inflated by unnecessary variables. This distinction is especially important here because several categorical variables (age group, income, education) each contribute multiple dummy variables.

  • **Root MSE (Residual Standard Error):7.352.The residual standard error is 7.352 days. This means that, on average, the model’s predicted number of mentally unhealthy days differs from the observed value by approximately 7.35 days. Given that the outcome ranges from 0 to 30 days, this represents a meaningful degree of prediction error, consistent with the relatively modest R-squared.

  • Overall F-test: State H₀, report F-statistic, df (numerator and denominator), p-value, and conclusion. H₀: All regression coefficients are simultaneously equal to zero — that is, none of the predictors are associated with mentally unhealthy days (β₁ = β₂ = … = β₂₉ = 0) Hₐ: At least one regression coefficient is not equal to zero F-statistic: 62.52 Degrees of freedom: 29 (numerator) and 7,970 (denominator) p-value: < 2.2 × 10⁻¹⁶ Conclusion: We reject H₀ at α = 0.05. There is overwhelming statistical evidence that at least one predictor in the model is associated with the number of mentally unhealthy days, after accounting for all other predictors. The model as a whole fits significantly better than an intercept-only (null) model.

Part 2: Tests of Hypotheses

Step 1: Type III Partial F-Tests

type3_results <- car::Anova(model_full, type = "III")
type3_results
## Anova Table (Type III tests)
## 
## Response: menthlth_days
##               Sum Sq   Df  F value    Pr(>F)    
## (Intercept)     4957    1  91.7191 < 2.2e-16 ***
## physhlth_days  37623    1 696.1198 < 2.2e-16 ***
## bmi              345    1   6.3879 0.0115095 *  
## sex             3724    1  68.9012 < 2.2e-16 ***
## exercise         497    1   9.1966 0.0024325 ** 
## age_group      30092   12  46.3986 < 2.2e-16 ***
## income          4734    6  14.5982 < 2.2e-16 ***
## education       1265    4   5.8521 0.0001064 ***
## smoking         5101    3  31.4613 < 2.2e-16 ***
## Residuals     430750 7970                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display as formatted table
type3_results |>
  as.data.frame() |>
  rownames_to_column("Predictor") |>
  kable(digits = 4, caption = "Type III (Partial) Sums of Squares") |>
  kable_styling(bootstrap_options = c("striped", "hover"))
Type III (Partial) Sums of Squares
Predictor Sum Sq Df F value Pr(>F)
(Intercept) 4957.0906 1 91.7191 0.0000
physhlth_days 37622.7952 1 696.1198 0.0000
bmi 345.2408 1 6.3879 0.0115
sex 3723.8662 1 68.9012 0.0000
exercise 497.0434 1 9.1966 0.0024
age_group 30092.1774 12 46.3986 0.0000
income 4733.8943 6 14.5982 0.0000
education 1265.1504 4 5.8521 0.0001
smoking 5101.1076 3 31.4613 0.0000
Residuals 430750.0872 7970 NA NA

Step 2: Chunk Test for Income

# Reduced model excluding income
model_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)

# Partial F-test comparing reduced vs. full
anova(model_no_income, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7976 435484                                  
## 2   7970 430750  6    4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H₀: After accounting for all other predictors in the model, the income coefficients are jointly equal to zero — that is, income (as a group) is not associated with mentally unhealthy days (β $15k–$25k = β$25k–$35k = β$35k–$50k = β$50k–$100k = β$100k–$200k = β$200k+ = 0) Hₐ: At least one income coefficient is not equal to zero — that is, income collectively improves model fit after accounting for all other predictors F-statistic: 14.598 Degrees of freedom: 6 (numerator — the number of income dummy variables being tested) and 7,970 (denominator — residual df from the full model) p-value: < 2.2 × 10⁻¹⁶ Conclusion: We reject H₀ at α = 0.05. There is very strong statistical evidence that income, as a group of predictors, is significantly associated with mentally unhealthy days after accounting for physical health days, BMI, sex, exercise, age group, education, and smoking status. Removing income from the model results in a significantly worse fit, indicating that income collectively contributes meaningful explanatory power beyond what the other predictors provide.

Step 3: Chunk Test for Education

# Reduced model excluding education
model_no_education <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

# Partial F-test comparing reduced vs. full
anova(model_no_education, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7974 432015                                  
## 2   7970 430750  4    1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H₀: After accounting for all other predictors in the model (physical health days, BMI, sex, exercise, age group, income, and smoking), the education coefficients are all simultaneously equal to zero — that is, education as a group is not associated with mentally unhealthy days (β_HS = β_SomeCollege = β_CollegeGrad = β_GradDegree = 0) Hₐ: At least one education coefficient is not equal to zero — that is, education as a group adds statistically significant explanatory power after accounting for all other predictors F-statistic: 5.8521 Degrees of freedom: 4 (numerator) and 7,970 (denominator) p-value: 0.0001064 Conclusion: We reject H₀ at α = 0.05. There is strong statistical evidence that education, considered as a group of predictors, is significantly associated with mentally unhealthy days after accounting for all other variables in the model. Adding the four education dummy variables to the model produces a meaningful improvement in fit beyond what physical health, BMI, sex, exercise, age group, income, and smoking already explain.

Step 4: Written Synthesis

Based on the Type III partial F-tests, physical health days, sex, exercise, and smoking status emerged as the strongest independent contributors to mentally unhealthy days, each showing highly statistically significant partial associations (p < 0.05) after accounting for all other predictors in the model. Age group also reached significance as a group, while the contributions of BMI were more modest. The chunk test for income — which jointly tested all six income dummy variables simultaneously revealed that income as a whole significantly improved model fit beyond the other predictors even if some individual income levels did not appear strongly significant in the individual t-test. this matters because a categorical variable with many levels can collectively explain meaningful variance even when no single level clears the significance threshold on its own. Similarly, the chunk test for education tested whether the four education dummy variables together added explanatory power, providing a single, more statistically powerful test of education’s overall contribution rather than fragmenting its effect across four separate comparisons. Together, the chunk tests complement the Type III results by answering a different and arguably more relevant question — not “is this specific level different from the reference?” but “does this entire variable belong in the model?” which is the appropriate inferential question when a categorical predictor is treated as a conceptual unit.

Part 3: Interaction Analysis

Step 1: Create Binary Smoking Variable

brfss_analytic <- brfss_analytic |>
  mutate(
    smoker_current = factor(
      case_when(
        smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
        smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
        TRUE ~ NA_character_
      ),
      levels = c("Non-smoker", "Current smoker")   # Non-smoker as reference
    )
  )

# Verify coding
table(brfss_analytic$smoker_current, useNA = "ifany")
## 
##     Non-smoker Current smoker 
##           7074            926

Step 2: Fit Model A (Main Effects Only) and Model B (With Interaction)

# Model A: main effects only
model_A <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

# Model B: with sex * smoker_current interaction
model_B <- lm(
  menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_B)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.337  -3.837  -1.604   0.628  30.426 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.22653    0.90967   6.845 8.23e-12
## physhlth_days                              0.26859    0.01007  26.679  < 2e-16
## bmi                                        0.03309    0.01323   2.502 0.012380
## sexFemale                                  1.18553    0.17752   6.678 2.58e-11
## smoker_currentCurrent smoker               1.52079    0.36539   4.162 3.19e-05
## exerciseNo                                 0.67285    0.21496   3.130 0.001753
## age_group25-29                            -0.92018    0.51962  -1.771 0.076616
## age_group30-34                            -0.89242    0.50591  -1.764 0.077774
## age_group35-39                            -1.59287    0.48752  -3.267 0.001090
## age_group40-44                            -2.62864    0.48564  -5.413 6.39e-08
## age_group45-49                            -2.84255    0.49687  -5.721 1.10e-08
## age_group50-54                            -3.27784    0.48195  -6.801 1.11e-11
## age_group55-59                            -4.24987    0.47404  -8.965  < 2e-16
## age_group60-64                            -4.26404    0.45671  -9.336  < 2e-16
## age_group65-69                            -5.25062    0.44662 -11.756  < 2e-16
## age_group70-74                            -5.71106    0.45439 -12.569  < 2e-16
## age_group75-79                            -5.90758    0.47018 -12.565  < 2e-16
## age_group80+                              -6.49946    0.47359 -13.724  < 2e-16
## income$15,000 to $24,999                  -1.63574    0.46999  -3.480 0.000503
## income$25,000 to $34,999                  -2.07457    0.44862  -4.624 3.82e-06
## income$35,000 to $49,999                  -2.54551    0.43915  -5.796 7.03e-09
## income$50,000 to $99,999                  -3.04298    0.41157  -7.394 1.57e-13
## income$100,000 to $199,999                -3.50972    0.42999  -8.162 3.79e-16
## income$200,000 or more                    -3.84047    0.50103  -7.665 2.00e-14
## educationHigh school diploma or GED        0.12556    0.81238   0.155 0.877177
## educationSome college or technical school  1.11789    0.69123   1.617 0.105865
## educationCollege graduate                  1.81788    0.69283   2.624 0.008711
## educationGraduate or professional degree   1.66905    0.69428   2.404 0.016240
## sexFemale:smoker_currentCurrent smoker     1.28327    0.51705   2.482 0.013089
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseNo                                ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to $24,999                  ***
## income$25,000 to $34,999                  ***
## income$35,000 to $49,999                  ***
## income$50,000 to $99,999                  ***
## income$100,000 to $199,999                ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## sexFemale:smoker_currentCurrent smoker    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared:  0.1826, Adjusted R-squared:  0.1798 
## F-statistic: 63.61 on 28 and 7971 DF,  p-value: < 2.2e-16

Step 3: Test Whether Interaction is Statistically Significant

anova(model_A, model_B)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7972 432509                              
## 2   7971 432175  1    333.97 6.1598 0.01309 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H₀: The association between current smoking status and mentally unhealthy days does not differ by sex — i.e., the interaction coefficient β(sex × smoker_current) = 0. Hₐ: The association between current smoking status and mentally unhealthy days does differ by sex — i.e., β(sex × smoker_current) ≠ 0.

F-statistic: 6.16 Degrees of freedom: 1 (numerator), 7971 (denominator) p-value: 0.0131 Conclusion: At α = 0.05, we reject H₀ (p = 0.013 < 0.05). There is statistically significant evidence that the association between current smoking and the number of mentally unhealthy days differs by sex. The interaction term improves model fit beyond the main effects alone, supporting the presence of effect modification by sex.

Step 4: Interpret the Interaction Coefficients

# Extract relevant coefficients
tidy(model_B) |>
  filter(str_detect(term, "smoker_current|sex")) |>
  kable(digits = 4, caption = "Interaction-Related Coefficients from Model B") |>
  kable_styling(bootstrap_options = c("striped", "hover"))
Interaction-Related Coefficients from Model B
term estimate std.error statistic p.value
sexFemale 1.1855 0.1775 6.6784 0.0000
smoker_currentCurrent smoker 1.5208 0.3654 4.1621 0.0000
sexFemale:smoker_currentCurrent smoker 1.2833 0.5171 2.4819 0.0131
b_smoker   <- coef(model_B)["smoker_currentCurrent smoker"]
b_interact <- coef(model_B)["sexFemale:smoker_currentCurrent smoker"]

cat("Effect of current smoking among Men (reference):", round(b_smoker, 3), "\n")
## Effect of current smoking among Men (reference): 1.521
cat("Effect of current smoking among Women:", round(b_smoker + b_interact, 3), "\n")
## Effect of current smoking among Women: 2.804

Among men: The coefficient for smoker_currentCurrent smoker is 1.521. Among men, current smokers are associated with 1.521 more mentally unhealthy days in the past 30 days compared to non-smokers, holding all other variables constant.

Among women: The estimated effect among women is computed as: 1.5208 + 1.2833 = 2.8041 Among women, current smokers are associated with 2.804 more mentally unhealthy days in the past 30 days compared to non-smokers, holding all other variables constant.

The association between current smoking and mentally unhealthy days is stronger among women than among men. Specifically, the estimated difference between current smokers and non-smokers is approximately 2.804 days among women versus 1.521 days among men a difference of about 1.283 days, which is precisely the interaction coefficient. This suggests that sex modifies the association between smoking and mental health, with women who smoke reporting a notably larger number of mentally unhealthy days relative to non-smoking women than is seen among men

Step 5: Visualize the Interaction

# Generate predicted values
pred_interaction <- ggpredict(model_B, terms = c("smoker_current", "sex"))

# Plot
plot(pred_interaction) +
  labs(
    title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
    x = "Smoking Status",
    y = "Predicted Mentally Unhealthy Days (Past 30)",
    color = "Sex"
  ) +
  theme_minimal(base_size = 13)

Step 6: Public Health Interpretation

A. Findings and Limitations (6 pts)

The findings indicate that physical health, smoking status, sex, and income are among the most influential predictors of mentally unhealthy days in this sample. Physically unhealthy days exhibited the strongest positive relationship with mental health burden, reinforcing the well-documented connection between physical and mental well-being. Being female and currently smoking were both independently linked to a higher number of mentally unhealthy days, whereas higher income was associated with fewer such days. In contrast, variables like BMI and certain education categories showed weaker or non-significant associations, suggesting that their effects on mental health may be more complex or operate through indirect pathways. A major limitation of cross-sectional data is that exposures and outcomes are measured simultaneously, preventing the establishment of temporal order and leaving open the possibility of reverse causation—for instance, poor mental health could contribute to smoking rather than result from it. Additionally, the BRFSS relies on self-reported information, which may introduce recall bias and social desirability bias. Two notable confounders that were not accounted for include (1) prior diagnosis of mental illness, which is likely associated with both smoking and increased mentally unhealthy days, and (2) social support or relationship status, which can influence mental health and may also be related to smoking behavior and sex.

B. From Simple to Multiple Regression (4 pts)

Regular R-squared increases mechanically every time a predictor is added to a model, even if that predictor has no true relationship with the outcome. Adjusted R-squared penalizes for the number of predictors, only increasing when a new variable improves fit more than expected by chance. This distinction is especially important here because categorical variables like age group (12 dummy variables), income (6 dummies), and education (4 dummies) collectively add many degrees of freedom — R-squared would inflate substantially even if these variables contributed little real explanatory power. Chunk tests (partial F-tests) are valuable because categorical variables with many levels are represented by multiple dummy coefficients in the model. Evaluating each level’s individual t-test in summary() only tells you whether that specific category differs from the reference — it does not tell you whether the variable as a whole improves model fit. A chunk test jointly tests all levels simultaneously, answering the more meaningful question: does income (or education) as a construct belong in the model at all, after accounting for all other predictors?

C. AI Transparency (5 pts)

I used AI in code debugging.I also used AI to understand the data set as the variables name is provided in the assignment instruction in capital letter where as the original data set have all the variables written in small letter. I asked AI why my data set was not loading. It suggested me to run names() to see all the variables that how I found out why the error was happening.