EPI 553 — Homework 3: Regression, Confounding/Interaction, & Model Selection

Submission: Knit this file to HTML, publish to RPubs with the title epi553_hw03_lastname_firstname, and paste the RPubs URL in the Brightspace submission comments. Also upload this .Rmd file to Brightspace.

Part 0: Data Preparation (10 points)

# Load required packages
library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)

# Import the dataset — update the path if needed
brfss_raw <- read_xpt("LLCP2023.XPT")

# Report dimensions of raw dataset
dim(brfss_raw)

## [1] 433323    350

nrow(brfss_raw)

## [1] 433323

ncol(brfss_raw)

## [1] 350

# Optional narrative output
cat("The raw BRFSS dataset contains", nrow(brfss_raw), "rows and", ncol(brfss_raw), "columns.\n")

## The raw BRFSS dataset contains 433323 rows and 350 columns.

# Select the nine required variables
brfss <- brfss_raw %>%
  select(
    MENTHLTH,
    PHYSHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`
  )

# Recode all variables
brfss_clean <- brfss %>%
  mutate(
    # Mentally unhealthy days
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    ),

    # Physically unhealthy days
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    ),

    # BMI
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),

    # Sex
    sex = case_when(
      SEXVAR == 1 ~ "Male",
      SEXVAR == 2 ~ "Female",
      TRUE ~ NA_character_
    ),
    sex = factor(sex, levels = c("Male", "Female")),

    # Exercise
    exercise = case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    exercise = factor(exercise, levels = c("No", "Yes")),

    # Age group
    age_group = case_when(
      `_AGEG5YR` == 1  ~ "18-24",
      `_AGEG5YR` == 2  ~ "25-29",
      `_AGEG5YR` == 3  ~ "30-34",
      `_AGEG5YR` == 4  ~ "35-39",
      `_AGEG5YR` == 5  ~ "40-44",
      `_AGEG5YR` == 6  ~ "45-49",
      `_AGEG5YR` == 7  ~ "50-54",
      `_AGEG5YR` == 8  ~ "55-59",
      `_AGEG5YR` == 9  ~ "60-64",
      `_AGEG5YR` == 10 ~ "65-69",
      `_AGEG5YR` == 11 ~ "70-74",
      `_AGEG5YR` == 12 ~ "75-79",
      `_AGEG5YR` == 13 ~ "80+",
      `_AGEG5YR` == 14 ~ NA_character_,
      TRUE ~ NA_character_
    ),
    age_group = factor(
      age_group,
      levels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
                 "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")
    ),

    # Income
    income = case_when(
      `_INCOMG1` == 1 ~ "Less than $15,000",
      `_INCOMG1` == 2 ~ "$15,000 to < $25,000",
      `_INCOMG1` == 3 ~ "$25,000 to < $35,000",
      `_INCOMG1` == 4 ~ "$35,000 to < $50,000",
      `_INCOMG1` == 5 ~ "$50,000 to < $100,000",
      `_INCOMG1` == 6 ~ "$100,000 to < $200,000",
      `_INCOMG1` == 7 ~ "$200,000 or more",
      `_INCOMG1` == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ),
    income = factor(
      income,
      levels = c("Less than $15,000",
                 "$15,000 to < $25,000",
                 "$25,000 to < $35,000",
                 "$35,000 to < $50,000",
                 "$50,000 to < $100,000",
                 "$100,000 to < $200,000",
                 "$200,000 or more")
    ),

    # Education
    education = case_when(
      EDUCA %in% c(1, 2) ~ "Less than high school",
      EDUCA == 3 ~ "High school diploma or GED",
      EDUCA == 4 ~ "Some college or technical school",
      EDUCA == 5 ~ "College graduate",
      EDUCA == 6 ~ "Graduate or professional degree",
      EDUCA == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ),
    education = factor(
      education,
      levels = c("Less than high school",
                 "High school diploma or GED",
                 "Some college or technical school",
                 "College graduate",
                 "Graduate or professional degree")
    ),

    # Smoking
    smoking = case_when(
      `_SMOKER3` == 1 ~ "Current daily smoker",
      `_SMOKER3` == 2 ~ "Current some-day smoker",
      `_SMOKER3` == 3 ~ "Former smoker",
      `_SMOKER3` == 4 ~ "Never smoker",
      `_SMOKER3` == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ),
    smoking = factor(
      smoking,
      levels = c("Current daily smoker",
                 "Current some-day smoker",
                 "Former smoker",
                 "Never smoker")
    )
  ) %>%
  select(
    menthlth_days,
    physhlth_days,
    bmi,
    sex,
    exercise,
    age_group,
    income,
    education,
    smoking
  )

# Quick structure check
glimpse(brfss_clean)

## Rows: 433,323
## Columns: 9
## $ menthlth_days <dbl> 0, 0, 2, 0, 0, 3, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, …
## $ physhlth_days <dbl> 0, 0, 6, 2, 0, 2, 8, 1, 5, 0, 0, 2, 0, 0, 0, 4, 30, 15, …
## $ bmi           <dbl> 30.47, 28.56, 22.31, 27.44, 25.85, 30.18, 24.41, 27.27, …
## $ sex           <fct> Female, Female, Female, Female, Female, Female, Male, Fe…
## $ exercise      <fct> No, Yes, Yes, Yes, Yes, Yes, No, No, No, Yes, Yes, No, Y…
## $ age_group     <fct> 80+, 80+, 80+, 75-79, 75-79, 60-64, 80+, 75-79, 80+, 75-…
## $ income        <fct> NA, NA, "Less than $15,000", NA, "$50,000 to < $100,000"…
## $ education     <fct> College graduate, College graduate, Some college or tech…
## $ smoking       <fct> Never smoker, Never smoker, Former smoker, Never smoker,…

# Report number and percentage missing for menthlth_days
#    and at least two other key variables, BEFORE removing missingness
missingness_table <- brfss_clean %>%
  summarize(
    menthlth_days_missing_n = sum(is.na(menthlth_days)),
    menthlth_days_missing_pct = mean(is.na(menthlth_days)) * 100,

    physhlth_days_missing_n = sum(is.na(physhlth_days)),
    physhlth_days_missing_pct = mean(is.na(physhlth_days)) * 100,

    bmi_missing_n = sum(is.na(bmi)),
    bmi_missing_pct = mean(is.na(bmi)) * 100,

    smoking_missing_n = sum(is.na(smoking)),
    smoking_missing_pct = mean(is.na(smoking)) * 100
  )

missingness_table

## # A tibble: 1 × 8
##   menthlth_days_missing_n menthlth_days_missing_pct physhlth_days_missing_n
##                     <int>                     <dbl>                   <int>
## 1                    8108                      1.87                   10785
## # ℹ 5 more variables: physhlth_days_missing_pct <dbl>, bmi_missing_n <int>,
## #   bmi_missing_pct <dbl>, smoking_missing_n <int>, smoking_missing_pct <dbl>

# Optional prettier missingness table
missingness_long <- tibble(
  variable = c("menthlth_days", "physhlth_days", "bmi", "smoking"),
  missing_n = c(
    sum(is.na(brfss_clean$menthlth_days)),
    sum(is.na(brfss_clean$physhlth_days)),
    sum(is.na(brfss_clean$bmi)),
    sum(is.na(brfss_clean$smoking))
  ),
  missing_pct = c(
    mean(is.na(brfss_clean$menthlth_days)) * 100,
    mean(is.na(brfss_clean$physhlth_days)) * 100,
    mean(is.na(brfss_clean$bmi)) * 100,
    mean(is.na(brfss_clean$smoking)) * 100
  )
) %>%
  mutate(missing_pct = round(missing_pct, 2))

missingness_long %>%
  kbl(caption = "Missingness Before Creating Analytic Dataset",
      col.names = c("Variable", "Missing N", "Missing %")) %>%
  kable_classic(full_width = FALSE)

Missingness Before Creating Analytic Dataset
Variable	Missing N	Missing %
menthlth_days	8108	1.87
physhlth_days	10785	2.49
bmi	40535	9.35
smoking	23062	5.32

# Create analytic dataset
set.seed(1220)

brfss_analytic <- brfss_clean %>%
  drop_na(
    menthlth_days,
    physhlth_days,
    bmi,
    sex,
    exercise,
    age_group,
    income,
    education,
    smoking
  ) %>%
  slice_sample(n = 8000)

# Report final analytic n
nrow(brfss_analytic)

## [1] 8000

cat("Final analytic sample size:", nrow(brfss_analytic), "\n")

## Final analytic sample size: 8000

# 12. Produce descriptive statistics table stratified by sex
table1 <- brfss_analytic %>%
  tbl_summary(
    by = sex,
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 2,
    missing = "no"
  ) %>%
  add_overall() %>%
  bold_labels()

table1

Characteristic	Overall N = 8,000¹	Male N = 3,936¹	Female N = 4,064¹
menthlth_days	4.32 (8.13)	3.53 (7.53)	5.08 (8.60)
physhlth_days	4.43 (8.69)	3.98 (8.41)	4.86 (8.94)
bmi	28.70 (6.50)	28.71 (5.97)	28.69 (6.98)
exercise	6,240 (78%)	3,146 (80%)	3,094 (76%)
age_group
18-24	406 (5.1%)	235 (6.0%)	171 (4.2%)
25-29	408 (5.1%)	219 (5.6%)	189 (4.7%)
30-34	463 (5.8%)	253 (6.4%)	210 (5.2%)
35-39	565 (7.1%)	263 (6.7%)	302 (7.4%)
40-44	582 (7.3%)	290 (7.4%)	292 (7.2%)
45-49	518 (6.5%)	266 (6.8%)	252 (6.2%)
50-54	608 (7.6%)	305 (7.7%)	303 (7.5%)
55-59	660 (8.3%)	308 (7.8%)	352 (8.7%)
60-64	787 (9.8%)	408 (10%)	379 (9.3%)
65-69	901 (11%)	418 (11%)	483 (12%)
70-74	808 (10%)	382 (9.7%)	426 (10%)
75-79	663 (8.3%)	325 (8.3%)	338 (8.3%)
80+	631 (7.9%)	264 (6.7%)	367 (9.0%)
income
Less than $15,000	407 (5.1%)	160 (4.1%)	247 (6.1%)
$15,000 to < $25,000	641 (8.0%)	271 (6.9%)	370 (9.1%)
$25,000 to < $35,000	871 (11%)	376 (9.6%)	495 (12%)
$35,000 to < $50,000	1,067 (13%)	482 (12%)	585 (14%)
$50,000 to < $100,000	2,511 (31%)	1,251 (32%)	1,260 (31%)
$100,000 to < $200,000	1,865 (23%)	996 (25%)	869 (21%)
$200,000 or more	638 (8.0%)	400 (10%)	238 (5.9%)
education
Less than high school	124 (1.6%)	75 (1.9%)	49 (1.2%)
High school diploma or GED	252 (3.2%)	130 (3.3%)	122 (3.0%)
Some college or technical school	1,827 (23%)	950 (24%)	877 (22%)
College graduate	2,138 (27%)	1,018 (26%)	1,120 (28%)
Graduate or professional degree	3,659 (46%)	1,763 (45%)	1,896 (47%)
smoking
Current daily smoker	658 (8.2%)	339 (8.6%)	319 (7.8%)
Current some-day smoker	268 (3.4%)	151 (3.8%)	117 (2.9%)
Former smoker	2,262 (28%)	1,207 (31%)	1,055 (26%)
Never smoker	4,812 (60%)	2,239 (57%)	2,573 (63%)
¹ Mean (SD); n (%)

# Save cleaned datasets so you do not have to reload the XPT every time
saveRDS(brfss_clean, "brfss_clean.rds")
saveRDS(brfss_analytic, "brfss_analytic.rds")

Step 10 Interpretation:

Before removing missing values, there were 8,108 missing observations (1.87%) for menthlth_days.

For physhlth_days, there were 10,785 missing observations (2.49%).

For bmi, there were 40,535 missing observations (9.35%), representing the highest proportion of missing data among the variables examined.

Additionally, smoking had 23,062 missing observations (5.32%).

Step 11 Interpretation:

The analytic dataset was created by removing observations with missing values on all nine analysis variables using drop_na(), followed by drawing a random sample of 8,000 observations using slice_sample() with set.seed(1220) to ensure reproducibility. The final analytic sample size was n = 8,000.

Part 1: Multiple Linear Regression (25 Points)

Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?

# =========================================================
# Part 1: Multiple Linear Regression
# =========================================================

# 1. Fit the model
model <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

# Display full model output
summary(model)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to < $25,000                -1.63703    0.46899  -3.491 0.000485
## income$25,000 to < $35,000                -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to < $50,000                -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to < $100,000               -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to < $200,000              -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to < $25,000                ***
## income$25,000 to < $35,000                ***
## income$35,000 to < $50,000                ***
## income$50,000 to < $100,000               ***
## income$100,000 to < $200,000              ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

# Step 2. Extract coefficients (for regression equation) 
#Rounded coefficients for equation
coef_table <- round(coef(model), 3)
coef_table

##                               (Intercept) 
##                                     9.651 
##                             physhlth_days 
##                                     0.266 
##                                       bmi 
##                                     0.033 
##                                 sexFemale 
##                                     1.390 
##                               exerciseYes 
##                                    -0.651 
##                            age_group25-29 
##                                    -1.057 
##                            age_group30-34 
##                                    -1.094 
##                            age_group35-39 
##                                    -1.811 
##                            age_group40-44 
##                                    -2.893 
##                            age_group45-49 
##                                    -3.056 
##                            age_group50-54 
##                                    -3.517 
##                            age_group55-59 
##                                    -4.496 
##                            age_group60-64 
##                                    -4.522 
##                            age_group65-69 
##                                    -5.578 
##                            age_group70-74 
##                                    -6.025 
##                            age_group75-79 
##                                    -6.287 
##                              age_group80+ 
##                                    -6.820 
##                income$15,000 to < $25,000 
##                                    -1.637 
##                income$25,000 to < $35,000 
##                                    -2.076 
##                income$35,000 to < $50,000 
##                                    -2.547 
##               income$50,000 to < $100,000 
##                                    -3.050 
##              income$100,000 to < $200,000 
##                                    -3.500 
##                    income$200,000 or more 
##                                    -3.784 
##       educationHigh school diploma or GED 
##                                     0.080 
## educationSome college or technical school 
##                                     1.077 
##                 educationCollege graduate 
##                                     1.791 
##  educationGraduate or professional degree 
##                                     1.738 
##            smokingCurrent some-day smoker 
##                                    -1.587 
##                      smokingFormer smoker 
##                                    -1.990 
##                       smokingNever smoker 
##                                    -2.937

Step 2 Interpretation:

The fitted regression equation is:

menthlth_days = 9.651 + 0.266(physhlth_days) + 0.033(bmi) + 1.390(Female) − 0.651(Exercise: Yes) − 1.057(age 25–29) − 1.094(age 30–34) − 1.811(age 35–39) − 2.893(age 40–44) − 3.056(age 45–49) − 3.517(age 50–54) − 4.496(age 55–59) − 4.522(age 60–64) − 5.578(age 65–69) − 6.025(age 70–74) − 6.287(age 75–79) − 6.820(age 80+) − 1.637($15k–<$25k) − 2.076($25k–<$35k) − 2.547($35k–<$50k) − 3.050($50k–<$100k) − 3.500($100k–<$200k) − 3.784($200k+)

0.080(HS/GED) + 1.077(Some college) + 1.791(College graduate) + 1.738(Graduate degree) − 1.990(Current some-day smoker) − 1.587(Former smoker) − 2.937(Never smoker)

Step 3 Interpretation:

Coefficients:

physhlth_days: For each additional physically unhealthy day, the number of mentally unhealthy days is expected to increase by 0.266 days, holding all other variables constant.

bmi: For each one-unit increase in BMI, mentally unhealthy days increase by 0.033 days, holding all other variables constant.

sex (Female vs Male): Compared to males, females report 1.390 more mentally unhealthy days, holding all other variables constant.

exercise (Yes vs No): Individuals who exercised reported 0.651 fewer mentally unhealthy days compared to those who did not exercise, holding all other variables constant.

age_group (example: 25–29): Compared to adults aged 18–24, individuals aged 25–29 report 1.057 fewer mentally unhealthy days, holding all other variables constant.

income (example 1: $50k–$100k): Compared to individuals earning less than $15,000, those earning $50,000 to < $100,000 report 3.050 fewer mentally unhealthy days, holding all other variables constant.

income (example 2: $200k+): Compared to individuals earning less than $15,000, those earning $200,000 or more report 3.784 fewer mentally unhealthy days, holding all other variables constant.

Step 4 Interpretation:

R-squared

The R-squared value is 0.185, indicating that approximately 18.5% of the variability in mentally unhealthy days is explained by the predictors included in the model.

Adjusted R-squared

The adjusted R-squared is 0.182, which is slightly lower than the R-squared because it accounts for the number of predictors in the model. This provides a more accurate estimate of model fit by penalizing the inclusion of unnecessary variables.

Root MSE (Residual Standard Error)

The residual standard error (Root MSE) is 7.352, meaning that the model’s predicted mentally unhealthy days differ from the observed values by about 7.35 days on average.

Overall F-test

Hypotheses:

The null hypothesis (H₀) is that all regression coefficients are equal to zero (i.e., none of the predictors are associated with mentally unhealthy days). The alternative hypothesis (H₁) is that at least one predictor is associated with mentally unhealthy days.

Test statistic:

The overall F-test was F(29, 7970) = 62.52, with a p-value < 0.001.

Conclusion:

Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the model provides a statistically significant fit. This indicates that at least one predictor is significantly associated with mentally unhealthy days. However, while the model explains a meaningful portion of variation in mental health, substantial unexplained variability remains, suggesting that other important factors are not captured.

Part 2: Tests of Hypotheses (20 Points)

# =========================================================
# Part 2: Tests of Hypotheses
# =========================================================

library(car)

# Full model from Part 1
model_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

# ---------------------------------------------------------
# Step 1: Type III (partial) sums of squares
# ---------------------------------------------------------
type3_results <- car::Anova(model_full, type = "III")
type3_results

## Anova Table (Type III tests)
## 
## Response: menthlth_days
##               Sum Sq   Df  F value    Pr(>F)    
## (Intercept)     5530    1 102.3152 < 2.2e-16 ***
## physhlth_days  37623    1 696.1198 < 2.2e-16 ***
## bmi              345    1   6.3879 0.0115095 *  
## sex             3724    1  68.9012 < 2.2e-16 ***
## exercise         497    1   9.1966 0.0024325 ** 
## age_group      30092   12  46.3986 < 2.2e-16 ***
## income          4734    6  14.5982 < 2.2e-16 ***
## education       1265    4   5.8521 0.0001064 ***
## smoking         5101    3  31.4613 < 2.2e-16 ***
## Residuals     430750 7970                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Optional: identify significant predictors at alpha = 0.05
type3_df <- as.data.frame(type3_results)
type3_df$Predictor <- rownames(type3_df)
type3_sig <- type3_df %>%
  dplyr::filter(`Pr(>F)` < 0.05)

type3_sig

##                   Sum Sq Df    F value        Pr(>F)     Predictor
## (Intercept)    5529.7722  1 102.315207  6.599585e-24   (Intercept)
## physhlth_days 37622.7952  1 696.119831 3.785884e-147 physhlth_days
## bmi             345.2408  1   6.387855  1.150955e-02           bmi
## sex            3723.8662  1  68.901236  1.205544e-16           sex
## exercise        497.0434  1   9.196599  2.432451e-03      exercise
## age_group     30092.1774 12  46.398647 1.377631e-107     age_group
## income         4733.8943  6  14.598232  1.192413e-16        income
## education      1265.1504  4   5.852145  1.064433e-04     education
## smoking        5101.1076  3  31.461265  3.285702e-20       smoking

# ---------------------------------------------------------
# Step 2: Chunk test for income
# Reduced model excludes income
# ---------------------------------------------------------
model_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)

income_chunk_test <- anova(model_no_income, model_full)
income_chunk_test

## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7976 435484                                  
## 2   7970 430750  6    4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ---------------------------------------------------------
# Step 3: Chunk test for education
# Reduced model excludes education
# ---------------------------------------------------------
model_no_education <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

education_chunk_test <- anova(model_no_education, model_full)
education_chunk_test

## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7974 432015                                  
## 2   7970 430750  4    1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 1 Interpretation: Type III (partial) sums of squares

The Type III sums of squares table was used to assess the partial association of each predictor with mentally unhealthy days, controlling for all other variables in the model. At α = 0.05, the following predictors had statistically significant partial associations with mentally unhealthy days: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking (all p-values < 0.05).

Step 2 Interpretation: Chunk test for income

Hypotheses

H₀: All income coefficients are equal to 0 after accounting for the other predictors.

Hₐ: At least one income coefficient is not equal to 0 after accounting for the other predictors.

The chunk test comparing the reduced model (without income) to the full model yielded F(6, 7970) = 14.60, with p < 0.001.

Since the p-value is less than 0.05, we reject the null hypothesis and conclude that income as a group significantly improves the model after accounting for the other predictors.

Step 3 Interpretation: Chunk test for education

Hypotheses

H₀: All education coefficients are equal to 0 after accounting for the other predictors. Hₐ: At least one education coefficient is not equal to 0 after accounting for the other predictors.

Test results

The chunk test comparing the reduced model (without education) to the full model yielded F(4, 7970) = 5.85, with p < 0.001.

Conclusion

Since the p-value is less than 0.05, we reject the null hypothesis and conclude that education as a group significantly improves the model after accounting for the other predictors.

Step 4 Interpretation:

The Type III results indicate that all predictors, including physical health days, BMI, sex, exercise, age group, income, education, and smoking, have statistically significant partial associations with mentally unhealthy days after adjusting for other variables. Among these, physhlth_days and age_group appear to make the strongest independent contributions based on their large F-statistics. The chunk test for income demonstrated that income as a group significantly improves model fit beyond the other predictors, and the chunk test for education showed a similar significant contribution. These chunk tests provide additional insight beyond individual t-tests by evaluating whether groups of related variables jointly contribute to the model, even if some individual categories may not be significant on their own. Overall, physical health and smoking emerged as the strongest independent predictors, while education showed weaker and less consistent associations after adjustment.

Part 3: Interaction Analysis (25 Points)

Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women.

# =========================================================
# Part 3: Interaction Analysis
# =========================================================

library(tidyverse)
library(ggeffects)

# Step 1: Create binary smoking variable
brfss_analytic <- brfss_analytic %>%
  mutate(
    smoker_current = case_when(
      smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
      smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
      TRUE ~ NA_character_
    ),
    smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker"))
  )

# Check counts
table(brfss_analytic$smoker_current, useNA = "ifany")

## 
##     Non-smoker Current smoker 
##           7074            926

# Step 2: Fit Model A (main effects only)
model_A <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_A)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2913  -3.8732  -1.6219   0.6681  30.4937 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.75533    0.92706   7.287 3.48e-13
## physhlth_days                              0.26862    0.01007  26.673  < 2e-16
## bmi                                        0.03341    0.01323   2.525 0.011589
## sexFemale                                  1.33309    0.16732   7.967 1.84e-15
## smoker_currentCurrent smoker               2.12874    0.27121   7.849 4.74e-15
## exerciseYes                               -0.67248    0.21503  -3.127 0.001770
## age_group25-29                            -0.91492    0.51978  -1.760 0.078412
## age_group30-34                            -0.88226    0.50606  -1.743 0.081301
## age_group35-39                            -1.58102    0.48765  -3.242 0.001191
## age_group40-44                            -2.61572    0.48577  -5.385 7.46e-08
## age_group45-49                            -2.82462    0.49698  -5.684 1.37e-08
## age_group50-54                            -3.26004    0.48206  -6.763 1.45e-11
## age_group55-59                            -4.23010    0.47413  -8.922  < 2e-16
## age_group60-64                            -4.24840    0.45682  -9.300  < 2e-16
## age_group65-69                            -5.23383    0.44671 -11.716  < 2e-16
## age_group70-74                            -5.70233    0.45453 -12.546  < 2e-16
## age_group75-79                            -5.89774    0.47031 -12.540  < 2e-16
## age_group80+                              -6.48879    0.47372 -13.697  < 2e-16
## income$15,000 to < $25,000                -1.67974    0.46981  -3.575 0.000352
## income$25,000 to < $35,000                -2.10230    0.44863  -4.686 2.83e-06
## income$35,000 to < $50,000                -2.58691    0.43898  -5.893 3.95e-09
## income$50,000 to < $100,000               -3.08227    0.41140  -7.492 7.50e-14
## income$100,000 to < $200,000              -3.53598    0.43000  -8.223 2.30e-16
## income$200,000 or more                    -3.86251    0.50112  -7.708 1.43e-14
## educationHigh school diploma or GED        0.21386    0.81186   0.263 0.792237
## educationSome college or technical school  1.19653    0.69073   1.732 0.083264
## educationCollege graduate                  1.90349    0.69219   2.750 0.005974
## educationGraduate or professional degree   1.74564    0.69382   2.516 0.011890
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to < $25,000                ***
## income$25,000 to < $35,000                ***
## income$35,000 to < $50,000                ***
## income$50,000 to < $100,000               ***
## income$100,000 to < $200,000              ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school .  
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.366 on 7972 degrees of freedom
## Multiple R-squared:  0.182,  Adjusted R-squared:  0.1792 
## F-statistic:  65.7 on 27 and 7972 DF,  p-value: < 2.2e-16

# Fit Model B (with interaction)
model_B <- lm(
  menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_B)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.337  -3.837  -1.604   0.628  30.426 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.89938    0.92857   7.430 1.20e-13
## physhlth_days                              0.26859    0.01007  26.679  < 2e-16
## bmi                                        0.03309    0.01323   2.502 0.012380
## sexFemale                                  1.18553    0.17752   6.678 2.58e-11
## smoker_currentCurrent smoker               1.52079    0.36539   4.162 3.19e-05
## exerciseYes                               -0.67285    0.21496  -3.130 0.001753
## age_group25-29                            -0.92018    0.51962  -1.771 0.076616
## age_group30-34                            -0.89242    0.50591  -1.764 0.077774
## age_group35-39                            -1.59287    0.48752  -3.267 0.001090
## age_group40-44                            -2.62864    0.48564  -5.413 6.39e-08
## age_group45-49                            -2.84255    0.49687  -5.721 1.10e-08
## age_group50-54                            -3.27784    0.48195  -6.801 1.11e-11
## age_group55-59                            -4.24987    0.47404  -8.965  < 2e-16
## age_group60-64                            -4.26404    0.45671  -9.336  < 2e-16
## age_group65-69                            -5.25062    0.44662 -11.756  < 2e-16
## age_group70-74                            -5.71106    0.45439 -12.569  < 2e-16
## age_group75-79                            -5.90758    0.47018 -12.565  < 2e-16
## age_group80+                              -6.49946    0.47359 -13.724  < 2e-16
## income$15,000 to < $25,000                -1.63574    0.46999  -3.480 0.000503
## income$25,000 to < $35,000                -2.07457    0.44862  -4.624 3.82e-06
## income$35,000 to < $50,000                -2.54551    0.43915  -5.796 7.03e-09
## income$50,000 to < $100,000               -3.04298    0.41157  -7.394 1.57e-13
## income$100,000 to < $200,000              -3.50972    0.42999  -8.162 3.79e-16
## income$200,000 or more                    -3.84047    0.50103  -7.665 2.00e-14
## educationHigh school diploma or GED        0.12556    0.81238   0.155 0.877177
## educationSome college or technical school  1.11789    0.69123   1.617 0.105865
## educationCollege graduate                  1.81788    0.69283   2.624 0.008711
## educationGraduate or professional degree   1.66905    0.69428   2.404 0.016240
## sexFemale:smoker_currentCurrent smoker     1.28327    0.51705   2.482 0.013089
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              ***
## exerciseYes                               ** 
## age_group25-29                            .  
## age_group30-34                            .  
## age_group35-39                            ** 
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to < $25,000                ***
## income$25,000 to < $35,000                ***
## income$35,000 to < $50,000                ***
## income$50,000 to < $100,000               ***
## income$100,000 to < $200,000              ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## sexFemale:smoker_currentCurrent smoker    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared:  0.1826, Adjusted R-squared:  0.1798 
## F-statistic: 63.61 on 28 and 7971 DF,  p-value: < 2.2e-16

# Step 3: Test interaction with partial F-test
interaction_test <- anova(model_A, model_B)
interaction_test

## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7972 432509                              
## 2   7971 432175  1    333.97 6.1598 0.01309 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Step 4: Extract and compute smoking effects by sex
coef(model_B)

##                               (Intercept) 
##                                6.89937969 
##                             physhlth_days 
##                                0.26858758 
##                                       bmi 
##                                0.03308926 
##                                 sexFemale 
##                                1.18553147 
##              smoker_currentCurrent smoker 
##                                1.52079072 
##                               exerciseYes 
##                               -0.67284825 
##                            age_group25-29 
##                               -0.92018465 
##                            age_group30-34 
##                               -0.89241633 
##                            age_group35-39 
##                               -1.59287470 
##                            age_group40-44 
##                               -2.62863606 
##                            age_group45-49 
##                               -2.84254506 
##                            age_group50-54 
##                               -3.27783622 
##                            age_group55-59 
##                               -4.24986875 
##                            age_group60-64 
##                               -4.26403799 
##                            age_group65-69 
##                               -5.25061692 
##                            age_group70-74 
##                               -5.71105572 
##                            age_group75-79 
##                               -5.90757854 
##                              age_group80+ 
##                               -6.49945710 
##                income$15,000 to < $25,000 
##                               -1.63574119 
##                income$25,000 to < $35,000 
##                               -2.07456733 
##                income$35,000 to < $50,000 
##                               -2.54550750 
##               income$50,000 to < $100,000 
##                               -3.04298017 
##              income$100,000 to < $200,000 
##                               -3.50972483 
##                    income$200,000 or more 
##                               -3.84047036 
##       educationHigh school diploma or GED 
##                                0.12555591 
## educationSome college or technical school 
##                                1.11789017 
##                 educationCollege graduate 
##                                1.81787943 
##  educationGraduate or professional degree 
##                                1.66905037 
##    sexFemale:smoker_currentCurrent smoker 
##                                1.28327305

effect_men <- coef(model_B)["smoker_currentCurrent smoker"]

effect_women <- coef(model_B)["smoker_currentCurrent smoker"] +
  coef(model_B)["sexFemale:smoker_currentCurrent smoker"]

effect_men

## smoker_currentCurrent smoker 
##                     1.520791

effect_women

## smoker_currentCurrent smoker 
##                     2.804064

# Optional: rounded values
round(effect_men, 3)

## smoker_currentCurrent smoker 
##                        1.521

round(effect_women, 3)

## smoker_currentCurrent smoker 
##                        2.804

# Step 5: Visualize the interaction
interaction_plot_data <- ggpredict(model_B, terms = c("smoker_current", "sex"))

plot(interaction_plot_data) +
  labs(
    x = "Smoking status",
    y = "Predicted mentally unhealthy days",
    title = "Predicted mentally unhealthy days by smoking status and sex",
    color = "Sex"
  )

Step 3 Interpretation:

Hypotheses

H₀: The interaction between sex and current smoking equals 0 (no effect modification). Hₐ: The interaction between sex and current smoking is not 0 (effect modification exists).

Results

From ANOVA table:

F(1, 7971) = 6.16 p = 0.0131 Conclusion

Since p = 0.013 < 0.05, we reject the null hypothesis

There is statistically significant evidence that the association between current smoking and mentally unhealthy days differs by sex.

Step 4 Interpretation:

Effect among men (reference group)

Coefficient:

smoker_currentCurrent smoker = 1.52079

Rounded:

Among men, being a current smoker is associated with 1.521 additional mentally unhealthy days compared with non-smokers, holding all other variables constant.

Effect among women

Smoking effect = 1.52079 Interaction = 1.28327

Total: 1.52079 + 1.28327 = 2.80406

Rounded:

Among women, being a current smoker is associated with 2.804 additional mentally unhealthy days compared with non-smokers, holding all other variables constant.

The association between current smoking and mentally unhealthy days is significantly stronger among women than men. While male smokers report approximately 1.5 additional mentally unhealthy days compared to non-smokers, female smokers report about 2.8 additional days. This difference of roughly 1.28 days indicates that smoking has a greater negative impact on mental health among women, suggesting meaningful effect modification by sex.

Step 6 Interpretation:

Current smoking is associated with worse mental health among U.S. adults, and this relationship differs between men and women. While both male and female smokers report more mentally unhealthy days than non-smokers, the increase is notably larger among women. This suggests that women may experience a greater mental health burden related to smoking or may be more vulnerable to its psychological effects. From a public health perspective, these findings highlight the importance of integrating mental health support into smoking cessation programs and suggest that interventions may need to be tailored by sex to be most effective.

Part 4: Reflection (15 points)

This analysis suggests that mental health among U.S. adults is shaped by a combination of physical health, behavioral factors, and socioeconomic conditions. The strongest predictor was physically unhealthy days, which showed a large and highly significant positive association with mentally unhealthy days, indicating that individuals experiencing more physical illness also report substantially worse mental health. Current smoking was also an important predictor, with a stronger association among women than men, highlighting meaningful effect modification by sex. Income and age were also strongly associated, with higher income levels and older age groups generally reporting fewer mentally unhealthy days. In contrast, some education categories (e.g., high school diploma or GED) and certain younger age groups were not statistically significant, suggesting weaker independent contributions after adjustment.

Several limitations should be considered. Because the data are cross-sectional, causal relationships cannot be established, and reverse causation is possible (e.g., poor mental health may influence smoking behavior). Additionally, all variables are self-reported, introducing potential recall bias and misclassification. Residual confounding is also a concern. For example, access to healthcare may influence both physical and mental health outcomes, and underlying mental health diagnoses (such as depression or anxiety disorders) could affect both smoking behavior and reported mentally unhealthy days. Other important confounders may include employment status and social support, which were not included in the model.

Adjusted R-squared provides a more informative measure of model fit than R-squared because it accounts for the number of predictors included. While R-squared will always increase when additional variables are added, Adjusted R-squared penalizes unnecessary complexity and helps determine whether added predictors meaningfully improve explanatory power. This distinction is especially important when including categorical variables like income and education, which introduce multiple parameters simultaneously.

Finally, chunk tests are useful because they evaluate the collective contribution of related predictors. Even if individual levels of a categorical variable are not statistically significant, the variable as a whole may still explain meaningful variation in the outcome. This provides a more complete understanding of how broader constructs, such as socioeconomic status, influence mental health beyond individual category comparisons.

End of Homework 3

EPI 553 — Homework 3: Regression, Confounding/Interaction, & Model Selection

Jessica Okoroji

Due: Thursday, April 5, 2026 at 11:59 PM ET

Part 0: Data Preparation (10 points)

Step 10 Interpretation:

Step 11 Interpretation:

Part 1: Multiple Linear Regression (25 Points)

Step 2 Interpretation:

Step 3 Interpretation:

Step 4 Interpretation:

Part 2: Tests of Hypotheses (20 Points)

Step 1 Interpretation: Type III (partial) sums of squares

Step 2 Interpretation: Chunk test for income

Step 3 Interpretation: Chunk test for education

Step 4 Interpretation:

Part 3: Interaction Analysis (25 Points)

Step 3 Interpretation:

Step 4 Interpretation:

Step 6 Interpretation:

Part 4: Reflection (15 points)