EPI 553 Homework 3: Multiple Linear Regression, Tests of Hypotheses, and Interaction Analysis

#Research Question: What individual and behavioral factors are associated with the number of mentally unhealthy days reported by U.S. adults, and does the association between current smoking and mental health differ by sex?

#Part 0: Data Acquisition and Preparation Download Packages

library (tidyverse)
library (kableExtra)
library (car)
library (broom)
library (haven)
library (dplyr)
library (janitor)
library (gtsummary)
library (ggeffects)

Import Data

brfss_2023_full <- read_xpt(
  "~/Documents/HEPI553/data/LLCP2023.XPT "
) |>
  clean_names()

Select 9 Variables; Recode

brfss_subset <- brfss_2023_full |>
  mutate(
    
    # Mental health days
    menthlth_days = case_when(
      menthlth == 88 ~ 0,
      menthlth %in% c(77, 99) ~ NA_real_,
      menthlth >= 1 & menthlth <= 30  ~ as.numeric(menthlth), 
      TRUE ~ NA_real_
    ),
    
    # Physical unhealth days
    physhlth_days = case_when(
      physhlth == 88 ~ 0,
      physhlth %in% c(77, 99) ~ NA_real_,
      physhlth >= 1 & physhlth <= 30  ~ as.numeric(physhlth),
      TRUE ~ NA_real_
    ),
    
    # BMI
    bmi = case_when(
      bmi5 == 9999 ~ NA_real_,
      TRUE ~ as.numeric(bmi5) / 100
    ),
    
    # Sex
    sex = factor(case_when(
      sexvar == 1 ~ "Male",
      sexvar == 2 ~ "Female",
      TRUE ~ NA_character_
    ),
    levels = c("Male", "Female")),
    
    # Exercise
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      exerany2 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    levels = c("No","Yes")),
    
    # Age group
    age_group = factor(case_when(
      ageg5yr == 1 ~ "18-24",
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64",
      ageg5yr == 10 ~ "65-69",
      ageg5yr == 11 ~ "70-74",
      ageg5yr == 12 ~ "75-79",
      ageg5yr == 13 ~ "80+",
      ageg5yr == 14 ~ NA_character_, 
      TRUE ~ NA_character_
    ),
    levels=c("18-24","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80+")),
    
    # Income
    income = factor(case_when(
      incomg1 == 1 ~ "Less than $15,000",
      incomg1 == 2 ~ "$15,000 to $24,999",
      incomg1 == 3 ~ "$25,000 to $34,999",
      incomg1 == 4 ~ "$35,000 to $49,999",
      incomg1 == 5 ~ "$50,000 to $99,999",
      incomg1 == 6 ~ "$100,000 to $199,999",
      incomg1 == 7 ~ "$200,000 or more",
      incomg1 == 9 ~ NA_character_, 
      TRUE ~ NA_character_
    ),
    levels = c("Less than $15,000","$15,000 to $24,999","$25,000 to $34,999","$35,000 to $49,999","$50,000 to $99,999","$100,000 to $199,999","$200,000 or more")),
    
    # Education
    education = factor(case_when(
      educa %in% c(1, 2) ~ "Less than high school",
      educa == 3 ~ "High school diploma or GED",
      educa == 4 ~ "Some college or technical school",
      educa == 5 ~ "College graduate",
      educa == 6 ~ "Graduate or professional degree",
      educa == 9 ~ NA_character_, 
      TRUE ~ NA_character_
    ), 
     levels = c("Less than high school","High school diploma or GED","Some college or technical school","College graduate","Graduate or professional degree")),
    
    # Smoking
    smoking = factor(case_when(
      smoker3 == 1 ~ "Current daily smoker",
      smoker3 == 2 ~ "Current some-day smoker",
      smoker3 == 3 ~ "Former smoker",
      smoker3 == 4 ~ "Never smoker",
      smoker3 == 9 ~ NA_character_
    ), 
    levels = c("Current daily smoker","Current some-day smoker","Former smoker","Never smoker"))
    
  ) |> 
  
select(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking)

Report Missing Observations

cat("Missing values in analytic dataset:", sum(!complete.cases(brfss_subset)), "\n")

## Missing values in analytic dataset: 122555

cat("All models are fit on the same n =", nrow(brfss_subset), "observations.\n")

## All models are fit on the same n = 433323 observations.

cat("Missing menthlth_days:", sum(is.na(brfss_subset$menthlth_days)), 
    "(", round(mean(is.na(brfss_subset$menthlth_days)) * 100, 1), "%)\n")

## Missing menthlth_days: 8108 ( 1.9 %)

cat("Missing physhlth_days:", sum(is.na(brfss_subset$physhlth_days)), 
    "(", round(mean(is.na(brfss_subset$physhlth_days)) * 100, 1), "%)\n")

## Missing physhlth_days: 10785 ( 2.5 %)

cat("Missing income:", sum(is.na(brfss_subset$income)), 
    "(", round(mean(is.na(brfss_subset$income)) * 100, 1), "%)\n")

## Missing income: 86623 ( 20 %)

Removing Observations With Missing Values

set.seed(1220)
brfss_analytic <- brfss_subset |>
 drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
 slice_sample(n = 8000)

Descriptive Statistics Table, Stratified by Sex

brfss_analytic |>
  select(menthlth_days, physhlth_days, age_group, sex, education, exercise, smoking, bmi, income) |>
  tbl_summary(
    by = sex,
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      age_group     ~ "Age (years)",
      education     ~ "Education level",
      exercise      ~ "Any physical activity (past 30 days)",
      smoking       ~ "Smoker Status",
      bmi           ~ "BMI Status",
      income ~ "Income Status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Statified by Sex**") |>
  as_gt()

**Table 1. Descriptive Statistics — BRFSS 2023 Statified by Sex**
Characteristic	N	Male N = 3,936¹	Female N = 4,064¹
Mentally unhealthy days (past 30)	8,000	3.5 (7.5)	5.1 (8.6)
Physically unhealthy days (past 30)	8,000	4.0 (8.4)	4.9 (8.9)
Age (years)	8,000
18-24		235 (6.0%)	171 (4.2%)
25-29		219 (5.6%)	189 (4.7%)
30-34		253 (6.4%)	210 (5.2%)
35-39		263 (6.7%)	302 (7.4%)
40-44		290 (7.4%)	292 (7.2%)
45-49		266 (6.8%)	252 (6.2%)
50-54		305 (7.7%)	303 (7.5%)
55-59		308 (7.8%)	352 (8.7%)
60-64		408 (10%)	379 (9.3%)
65-69		418 (11%)	483 (12%)
70-74		382 (9.7%)	426 (10%)
75-79		325 (8.3%)	338 (8.3%)
80+		264 (6.7%)	367 (9.0%)
Education level	8,000
Less than high school		75 (1.9%)	49 (1.2%)
High school diploma or GED		130 (3.3%)	122 (3.0%)
Some college or technical school		950 (24%)	877 (22%)
College graduate		1,018 (26%)	1,120 (28%)
Graduate or professional degree		1,763 (45%)	1,896 (47%)
Any physical activity (past 30 days)	8,000	3,146 (80%)	3,094 (76%)
Smoker Status	8,000
Current daily smoker		339 (8.6%)	319 (7.8%)
Current some-day smoker		151 (3.8%)	117 (2.9%)
Former smoker		1,207 (31%)	1,055 (26%)
Never smoker		2,239 (57%)	2,573 (63%)
BMI Status	8,000	28.7 (6.0)	28.7 (7.0)
Income Status	8,000
Less than $15,000		160 (4.1%)	247 (6.1%)
$15,000 to $24,999		271 (6.9%)	370 (9.1%)
$25,000 to $34,999		376 (9.6%)	495 (12%)
$35,000 to $49,999		482 (12%)	585 (14%)
$50,000 to $99,999		1,251 (32%)	1,260 (31%)
$100,000 to $199,999		996 (25%)	869 (21%)
$200,000 or more		400 (10%)	238 (5.9%)
¹ Mean (SD); n (%)

#Part 1: Multiple Linear Regression Research Question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?

Step 1: Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.

#MLR Model
m1 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking, data = brfss_analytic)

#Summary of MLR Model
summary(m1)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.65053    0.95407  10.115  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to $24,999                  -1.63703    0.46899  -3.491 0.000485
## income$25,000 to $34,999                  -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to $49,999                  -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to $99,999                  -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to $199,999                -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school diploma or GED        0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to $24,999                  ***
## income$25,000 to $34,999                  ***
## income$35,000 to $49,999                  ***
## income$50,000 to $99,999                  ***
## income$100,000 to $199,999                ***
## income$200,000 or more                    ***
## educationHigh school diploma or GED          
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

tidy(m1, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Naive Model",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Naive Model
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	9.6505	0.9541	10.1151	0.0000	7.7803	11.5208
physhlth_days	0.2656	0.0101	26.3841	0.0000	0.2459	0.2853
bmi	0.0334	0.0132	2.5274	0.0115	0.0075	0.0593
sexFemale	1.3904	0.1675	8.3007	0.0000	1.0620	1.7187
exerciseYes	-0.6512	0.2147	-3.0326	0.0024	-1.0721	-0.2303
age_group25-29	-1.0566	0.5194	-2.0343	0.0420	-2.0747	-0.0385
age_group30-34	-1.0939	0.5065	-2.1600	0.0308	-2.0867	-0.1012
age_group35-39	-1.8110	0.4885	-3.7072	0.0002	-2.7686	-0.8534
age_group40-44	-2.8931	0.4875	-5.9346	0.0000	-3.8487	-1.9375
age_group45-49	-3.0562	0.4977	-6.1408	0.0000	-4.0318	-2.0806
age_group50-54	-3.5167	0.4832	-7.2775	0.0000	-4.4640	-2.5695
age_group55-59	-4.4960	0.4755	-9.4543	0.0000	-5.4282	-3.5638
age_group60-64	-4.5221	0.4585	-9.8633	0.0000	-5.4209	-3.6234
age_group65-69	-5.5779	0.4502	-12.3903	0.0000	-6.4604	-4.6955
age_group70-74	-6.0254	0.4574	-13.1728	0.0000	-6.9220	-5.1287
age_group75-79	-6.2866	0.4748	-13.2392	0.0000	-7.2174	-5.3557
age_group80+	-6.8197	0.4768	-14.3019	0.0000	-7.7544	-5.8850
income$15,000 to $24,999	-1.6370	0.4690	-3.4905	0.0005	-2.5564	-0.7177
income$25,000 to $34,999	-2.0764	0.4480	-4.6351	0.0000	-2.9545	-1.1982
income$35,000 to $49,999	-2.5469	0.4382	-5.8122	0.0000	-3.4058	-1.6879
income$50,000 to $99,999	-3.0505	0.4107	-7.4277	0.0000	-3.8555	-2.2454
income$100,000 to $199,999	-3.4998	0.4292	-8.1537	0.0000	-4.3413	-2.6584
income$200,000 or more	-3.7841	0.5004	-7.5628	0.0000	-4.7649	-2.8033
educationHigh school diploma or GED	0.0799	0.8107	0.0986	0.9215	-1.5092	1.6690
educationSome college or technical school	1.0768	0.6897	1.5612	0.1185	-0.2753	2.4288
educationCollege graduate	1.7909	0.6912	2.5911	0.0096	0.4360	3.1458
educationGraduate or professional degree	1.7378	0.6925	2.5095	0.0121	0.3803	3.0953
smokingCurrent some-day smoker	-1.5867	0.5352	-2.9645	0.0030	-2.6359	-0.5375
smokingFormer smoker	-1.9897	0.3371	-5.9020	0.0000	-2.6506	-1.3289
smokingNever smoker	-2.9368	0.3216	-9.1313	0.0000	-3.5673	-2.3063

Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.

Mental Unhealthy Days = 9.651 + 0.267 (physthlth_days) + 0.033 (bmi) + 1.390 (sex) - 0.651 (exercise) - 1.057 (age_group25-29) - 1.094 (age_group30-34) - 1.811 (age_group35-39) - 2.893 (age_group40-44) - 0.356 (age_group45-49) - 3.517 (age_group50-54) - 4.496 (age_group55-59) - 4.522 (age_group60-64) -5.578 (age_group65-69) - 6.025 (age_group70-74) - 6.287 (age_group75-79) - 6.820 (age_group80+) - 1.637 (income 15,000-24,999) - 2.076 ( income 25,000-34,999) - 2.547 (income 35,000-49,999) - 3.051 (income 50,000-99,999) - 3.500 (income 100,000-199,999) - 3.784 (income 200,000 or more) + 0.080 (educationHigh school diploma or GED) + 1.077 (educationSome college or technical school) + 1.791 (educationCollege graduate) + 1.738 (educationGraduate or professional degree) - 1.587 (smokingCurrent some-day smoker) - 1.990 (smokingFormer smoker) - 2.937 (smokingNever smoker)

Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant

Physhlth_days (continuous predictor) = Each additional day of poor physical health is associated with an estimated 0.267 additional mentally unhealthy day on average, holding bmi, sex, exercise,age and income constant.

Bmi (continuous predictor) = Each one-unit increase in the BMI category is associated with 0.033 fewer mentally unhealthy days on average, holding all other variables constant.

Sex = Female vs. Male (reference) = Compared to males (the reference group), females report an estimated 1.390 more mentally unhealthy days on average, holding all other variables constant.

Exercise = Yes vs. No (reference) = People who engaged in any physical activity in the past 30 days report an estimated - 0.651 fewer mentally unhealthy days compared to those who did not exercise, adjusting for all other covariates, holding all of the other variables constant.

Age_group60-64 = Each additional year of age is associated with - 4.522 fewer mentally unhealthy days on average, holding all else constant. This counter intuitive finding is well-documented — older adults often report fewer mental health difficulties, possibly due to better emotion regulation, survivor bias, or cohort effects.

Income15,000 -24,999 = Each one-unit increase in the income category (15,000 -24,999) is associated with - 1.637 fewer mentally unhealthy days on average, consistent with the well-established socioeconomic gradient in mental health, holding all other variables constant.

Income25,000 - 34,999 = Each one-unit increase in the income category (25,000 - 34,999) is associated with - 2.076 fewer mentally unhealthy days on average, consistent with the well-established socioeconomic gradient in mental health, holding all other variables constant.

Intercept = The estimated mean mental health days for a person with zero physically unhealthy days, zero bmi, sex, zero exercise, age group 60-64, smoking status, income 15,000-24,999 and income 25,000-34,999. This is a mathematical artifact — not a meaningful quantity in context.

Step 4: Report and interpret each of the following model-level statistics:

glance(m1) %>%
  select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) %>%
  pivot_longer(everything(), names_to = "Metric", values_to = "Value") %>%
  mutate(
    Value = round(Value, 4),
    Metric = dplyr::recode(Metric,
      "r.squared"     = "R²",
      "adj.r.squared" = "Adjusted R²",
      "sigma"         = "Root MSE (σ̂)",
      "statistic"     = "F-statistic (overall)",
      "p.value"       = "p-value (overall F-test)",
      "df"            = "Model df (p)",
      "df.residual"   = "Residual df (n − p − 1)",
      "nobs"          = "n (observations)"
    )
  ) %>%
  kable(caption = "Table 2. Overall Model Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 2. Overall Model Summary
Metric	Value
R²	0.1853
Adjusted R²	0.1824
Root MSE (σ̂)	7.351

• R-squared: proportion of variance in mentally unhealthy days explained by all predictors

R^2: 0.1853 There is a 19% variance in mentally unhealthy days explained by all predictors.

• Adjusted R-squared: how it differs from R-squared and what it adds

R^2 adjusted: 0.1824 The R-squared is slightly higher compared to the adjusted, conveying certain predictors were penalized.

• Root MSE (residual standard error): average prediction error of the model

Root MSE: 7.351 There is a 7.4% standard error for the average prediction of the model.

• Overall F-test:

H0: Mental unhealthy days are not associated with all of the predictors H1: Mental unhealthy days are associated with all of the predictors

F-statistic: 62.52 Degrees of freedom (numerator and denominator): 29 P-value: 0

Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant and there is an association between mental unhealthy days and all the predictors (physical unhealthy days, bmi, age, sex, income, exercise and education)

#Part 2: Test of Hypotheses

Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output. Identify which predictors have statistically significant partial associations at α = 0.05.

anova_3 <- Anova(m1, type = "III")

anova_3 |>
  tidy() |>
  mutate(
    across(where(is.numeric), \(x) round(x, 4)),
    `Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
  ) |>
  kable(
    caption = "Table 3: Type III Partial Sum of Squares - Testing All Predictor's Contribution",
    col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  column_spec(6, bold = TRUE)

Table 3: Type III Partial Sum of Squares - Testing All Predictor’s Contribution
Predictor	Sum of Sq	df	F value	p-value	Significant?
(Intercept)	5529.7722	1	102.3152	0.0000	Yes
physhlth_days	37622.7952	1	696.1198	0.0000	Yes
bmi	345.2408	1	6.3879	0.0115	Yes
sex	3723.8662	1	68.9012	0.0000	Yes
exercise	497.0434	1	9.1966	0.0024	Yes
age_group	30092.1774	12	46.3986	0.0000	Yes
income	4733.8943	6	14.5982	0.0000	Yes
education	1265.1504	4	5.8521	0.0001	Yes
smoking	5101.1076	3	31.4613	0.0000	Yes
Residuals	430750.0872	7970	NA	NA	NA

Interpretation: The predictors that are statistically significant are physhlth_days, bmi, sex, exercise, age_group, income, education and smoking.

Step 2 (Chunk test for income): Test whether income collectively improves the model significantly, after accounting for all other predictors:

# Reduced model: only income
m_reduced <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + education +smoking, data = brfss_analytic)

# Full model: income + demographics
m_full <- lm(menthlth_days ~ physhlth_days + income + bmi + exercise + age_group + education +smoking, 
             data = brfss_analytic)

# Chunk test
anova(m_reduced, m_full) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (all predictors except income)", "Full (+ income)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 4. Chunk Test: Do demographic variables collectively add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 4. Chunk Test: Do demographic variables collectively add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (all predictors except income)	7977	440345.3	NA	NA	NA	NA
Full (+ income)	7971	434474.0	6	5871.359	17.953	0

H0: Income is not a key predictor for the model H1: Income is a key predictor for the model

F-statistic: 17.953 Degrees of freedom: 6 P-value: 0

State your conclusion at α = 0.05: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant and there is an association between income and the model.

Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.

# Reduced model: only education
m_reduced <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + income +smoking, data = brfss_analytic)

# Full model: education + demographics
m_full <- lm(menthlth_days ~ physhlth_days + income + bmi + exercise + age_group + education +smoking, 
             data = brfss_analytic)

# Chunk test
anova(m_reduced, m_full) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (all predictors except education)", "Full (+ education)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 5. Chunk Test: Do demographic variables collectively add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 5. Chunk Test: Do demographic variables collectively add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (all predictors except education)	7975	436108	NA	NA	NA	NA
Full (+ education)	7971	434474	4	1634.012	7.4945	0

H0: Education is not a key predictor for the model H1: Education is a key predictor for the model

F-statistic: 7.4945 Degrees of freedom:4 P-value: 0

State your conclusion at α = 0.05: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant and there is an association between education and the model.

Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()?

For the Type III, physhlth_days, income, bmi, exercise, age_group, education, and smoking are significant. This means the predictors adds to the prediction of mental unhealthy days. By using the sum of squares Type III, we are able to determine if a certain variable contributes to the model given all other variables. With the chunk test, we are able to determine that both predictors income and education are statistically significant since the p-value is less than 0.05 after adjusting for the other variables. The predictor with the strongest independent contributions are physical unhealthy days. The chunk tests add beyond the individual t-tests by conveying the significance from the group of variables rather than the coefficients.

#Part 3: Interaction Anlysis Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women.

Step 1: Create a binary smoking variable called smoker_current: • “Current smoker” includes current daily smokers and current some-day smokers. • “Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category.

brfss_analytic <- brfss_analytic %>%
  
  mutate(
    smoker_current = case_when(
      smoking %in% c("Current some-day smoker", "Current daily smoker") ~ "Current smoker",
        smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
      TRUE ~ NA_character_
    ),
    
    smoker_current = factor(smoker_current, 
                            levels = c("Non-smoker", "Current smoker"))
    )

Step 2: Fit two models: • Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. • Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.

# The maximum model with all candidate predictors
mod_a <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education,
              data = brfss_analytic)

tidy(mod_a, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Model A: Main Effects Only",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model A: Main Effects Only
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	6.7553	0.9271	7.2869	0.0000	4.9381	8.5726
physhlth_days	0.2686	0.0101	26.6731	0.0000	0.2489	0.2884
bmi	0.0334	0.0132	2.5250	0.0116	0.0075	0.0593
sexFemale	1.3331	0.1673	7.9675	0.0000	1.0051	1.6611
smoker_currentCurrent smoker	2.1287	0.2712	7.8489	0.0000	1.5971	2.6604
exerciseYes	-0.6725	0.2150	-3.1274	0.0018	-1.0940	-0.2510
age_group25-29	-0.9149	0.5198	-1.7602	0.0784	-1.9338	0.1040
age_group30-34	-0.8823	0.5061	-1.7434	0.0813	-1.8743	0.1097
age_group35-39	-1.5810	0.4877	-3.2421	0.0012	-2.5369	-0.6251
age_group40-44	-2.6157	0.4858	-5.3847	0.0000	-3.5680	-1.6635
age_group45-49	-2.8246	0.4970	-5.6836	0.0000	-3.7988	-1.8504
age_group50-54	-3.2600	0.4821	-6.7628	0.0000	-4.2050	-2.3151
age_group55-59	-4.2301	0.4741	-8.9219	0.0000	-5.1595	-3.3007
age_group60-64	-4.2484	0.4568	-9.3000	0.0000	-5.1439	-3.3529
age_group65-69	-5.2338	0.4467	-11.7163	0.0000	-6.1095	-4.3582
age_group70-74	-5.7023	0.4545	-12.5457	0.0000	-6.5933	-4.8113
age_group75-79	-5.8977	0.4703	-12.5401	0.0000	-6.8197	-4.9758
age_group80+	-6.4888	0.4737	-13.6975	0.0000	-7.4174	-5.5602
income$15,000 to $24,999	-1.6797	0.4698	-3.5754	0.0004	-2.6007	-0.7588
income$25,000 to $34,999	-2.1023	0.4486	-4.6861	0.0000	-2.9817	-1.2229
income$35,000 to $49,999	-2.5869	0.4390	-5.8931	0.0000	-3.4474	-1.7264
income$50,000 to $99,999	-3.0823	0.4114	-7.4921	0.0000	-3.8887	-2.2758
income$100,000 to $199,999	-3.5360	0.4300	-8.2232	0.0000	-4.3789	-2.6931
income$200,000 or more	-3.8625	0.5011	-7.7078	0.0000	-4.8448	-2.8802
educationHigh school diploma or GED	0.2139	0.8119	0.2634	0.7922	-1.3776	1.8053
educationSome college or technical school	1.1965	0.6907	1.7323	0.0833	-0.1575	2.5505
educationCollege graduate	1.9035	0.6922	2.7499	0.0060	0.5466	3.2604
educationGraduate or professional degree	1.7456	0.6938	2.5160	0.0119	0.3856	3.1057

# Compare a small model to the maximum model
mod_b <- lm(menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex * smoker_current, data = brfss_analytic)

anova(mod_a, mod_b) |>
  tidy() |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Model B: With Interaction ") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model B: With Interaction
term	df.residual	rss	df	sumsq	statistic	p.value
menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education	7972	432508.9	NA	NA	NA	NA
menthlth_days ~ physhlth_days + bmi + exercise + age_group + income + education + sex * smoker_current	7971	432174.9	1	333.9749	6.1598	0.0131

Step 3: Test whether the interaction is statistically significant:

H0: Predictor sex stratified by smoker_current is not associated with mental unhealthy days H1: Predictor sex stratified by smoker_current is associated with mental unhealthy days

F-statistic: 6.1598 Degrees of freedom: 1 P-value:.0131

State your conclusion at α = 0.05: Since the p-value is less than 0.05, we reject the null hypothesis. It is statistically significant, there is an association between the predictor sex stratified by current smoker and the outcome, mental unhealthy days.

Step 4: Interpret the interaction using the coefficients from Model B:

4a:Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language.

The estimated effect of current smoking among men have a 2.129 times higher chance of developing mentally unhealthy days compared to women. This signifies that men have a statistically significant change of developing the outcome.

4b:Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.

The estimated effect of current smoking among women have a 1.333 times lower chance of developing mentally unhealthy days compared to men. This signifies that women do not have a statistically significant change of developing the outcome.

4c: In 2 to 3 sentences, explain what these estimates mean together: is smoking more strongly associated with poor mental health in one sex than the other, and by how much?

When comparing both model A and model B, the estimates reveal that smoking more is strongly associated with poor mental health days in males compared to females. As both have p-values less than 0.05, the estimate for the coefficient for smoker_current which is for males have a higher statistic compared to the smoker_current plus interaction effect which is female. This conveys that one sex has a stronger association between the predictor and outcome.

Step 5: Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.

pred_int <- ggpredict(mod_b, terms = c("smoker_current", "sex"))

ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_point (size = 2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
  
  labs(
    title    = "Predicted Mental Health Days by Smoking and Sex",
    subtitle = "From interaction model: smoker_current * sex",
    x        = "smoker_current",
    y        = "Mental Unhealthy Days",
    color    = "Sex",
    fill     = "Sex"
  ) +
  theme_minimal(base_size = 13) +
  scale_color_brewer(palette = "Set1", labels = c("Men", "Women")) +
  scale_fill_brewer(palette = "Set1", labels = c("Men", "Women"))

Step 6: Write 3 to 5 sentences interpreting the interaction for a general public health audience. Do not include coefficient values or p-values. Focus on the substantive finding and its implications.

When predicting mentally unhealthy days by current smoking status and sex, men who are non-smokers have a lower chance of developing higher mentally unhealthy days compared to women who are non-smokers. Women who are current smokers have a higher chance of developing more mentally unhealthy days compared to men who are current smokers. The data indicates that when analyzing smoking status stratified by sex, women have a higher chance of developing the outcome, mental unhealthy days compared to men.

#Part 4: Reflection

A. Findings and Limitations After analyzing the results, there are certain predictors that give adults a higher chance of developing mentally unhealthy days in the United States. For example, physical unhealthy days, sex, exercise, age_group, income, certain education status and smoking status convey significant results for developing the outcome. It is also important to recognize that focusing on income, education and current smoking status, they show a higher chance of mentally unhealthy days compared to the other predictors used. The bmi, education High school diploma or GED and education Some college or technical school did not have significant results. A key limitation for using cross-sectional survey data is certain biases, such as selection bias. As we have used these specific predictors, there might be other variables that do not represent the whole population. If we misrepresent the population, this can lead to inaccurate results. Another limitation to using this data is the challenge of determining causative relationships. Since this data was used from the BRFSS 2023, it is using a single time period. This might be harder to identify which predictors are correlated with each other for the overall outcome. Two specific confounders that could bias the associations I estimated could be specific health conditions and environmental factors that can disrupt mentally unhealthy days.

Source: https://www.read.enago.com/blog/what-is-a-cross-sectional-study-advantages-disadvantages-and-examples/

B. From Simple to Multiple Regression The R-squared tells us the proportion of total variance in Y when analyzing the model while the adjusted R-squared penalizes the number of predictors p. As shown the adjusted r-squared value is slightly less than the original r-squared value. The adjustment allows the individual to see how other predictors can affect the overall outcome by taking out certain variables. This is important when adding multiple categorical predictors to provide accurate results while comparing both statistics. It is important to test groups of predictors collectively with chunk tests because the chunk tests convey the significance from the group of variables rather than the coefficients themselves.

C. AI Transparency

Chunk Tests: I had a difficult time reading how to code the question. I used my reduce mode, full model and chunk test code from the lecture slides but I had a hard time putting in the right variables. Chat informed me that I should take out income for the reduced model but leave income in for the full model.

Binary Smoking Variable: I started with writing out the code for a binary variable but I had a hard time trying to figure out how to call them current and non smokers. I typed out the parts of the code chat provided into my original code to give me my results.

Two Models: After using my code from a previous lecture, I could not understand why my degrees of freedom were -1. I asked chat and it informed me I just needed to switch my model from mod_b, mod_a to mod_a, mod_b for my interaction model.

Visual Model: I used a code from the previous lecture but when I ran it, it did not show my plotted points. I asked chat what was wrong. It informed me I needed to add the geom_point line and the scale_color_brewer / scale_fill_brewers lines. So I added those new lines to my original code.

EPI 553 Homework 3: Multiple Linear Regression, Tests of Hypotheses, and Interaction Analysis

June McCarthy

2026-04-02