title: ” Multiple Linear Regression, Tests of Hypotheses, and
Interaction Analysis” author: “Ummat Safwat Sristy” date:
“4/5/2026” output: html_document: toc: true toc_float: true
theme: flatly number_sections: true
library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)
# Load the full XPT file
brfss_raw <- read_xpt("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Labs\\LLCP2023.XPT")
# After first load, save as RDS for faster future loading:
saveRDS(brfss_raw, "brfss_hw3_raw.rds")
# In subsequent sessions, load with:
brfss_raw <- readRDS("brfss_hw3_raw.rds")
# Report dimensions
cat("Rows:", nrow(brfss_raw), "\n")
## Rows: 433323
cat("Columns:", ncol(brfss_raw), "\n")
## Columns: 350
brfss_selected <- brfss_raw |>
select(
MENTHLTH,
PHYSHLTH,
`_BMI5`,
SEXVAR,
EXERANY2,
`_AGEG5YR`,
`_INCOMG1`,
EDUCA,
`_SMOKER3`
)
cat("C:\\Users\\safwa\\OneDrive - University at Albany - SUNY\\EPI 553\\Labs\\LLCP2023.XPT")
## C:\Users\safwa\OneDrive - University at Albany - SUNY\EPI 553\Labs\LLCP2023.XPT
brfss_clean <- brfss_selected |>
mutate(
# MENTHLTH: 88 = None (recode to 0), 77/99 = NA, 1-30 keep as-is
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_
),
# PHYSHLTH: same rules as MENTHLTH
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_
),
# BMI: divide by 100, 9999 = NA
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ as.numeric(`_BMI5`) / 100
),
# Sex: 1 = Male, 2 = Female
sex = factor(
case_when(
SEXVAR == 1 ~ "Male",
SEXVAR == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
),
# Exercise: 1 = Yes, 2 = No, 7/9 = NA
exercise = factor(
case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
EXERANY2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Yes", "No")
),
# Age group: 13 levels, 14 = NA
age_group = factor(
case_when(
`_AGEG5YR` == 1 ~ "18-24",
`_AGEG5YR` == 2 ~ "25-29",
`_AGEG5YR` == 3 ~ "30-34",
`_AGEG5YR` == 4 ~ "35-39",
`_AGEG5YR` == 5 ~ "40-44",
`_AGEG5YR` == 6 ~ "45-49",
`_AGEG5YR` == 7 ~ "50-54",
`_AGEG5YR` == 8 ~ "55-59",
`_AGEG5YR` == 9 ~ "60-64",
`_AGEG5YR` == 10 ~ "65-69",
`_AGEG5YR` == 11 ~ "70-74",
`_AGEG5YR` == 12 ~ "75-79",
`_AGEG5YR` == 13 ~ "80+",
`_AGEG5YR` == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-24","25-29","30-34","35-39","40-44",
"45-49","50-54","55-59","60-64","65-69",
"70-74","75-79","80+")
),
# Income: 7 levels, 9 = NA
income = factor(
case_when(
`_INCOMG1` == 1 ~ "Less than $15,000",
`_INCOMG1` == 2 ~ "$15,000 to $24,999",
`_INCOMG1` == 3 ~ "$25,000 to $34,999",
`_INCOMG1` == 4 ~ "$35,000 to $49,999",
`_INCOMG1` == 5 ~ "$50,000 to $99,999",
`_INCOMG1` == 6 ~ "$100,000 to $199,999",
`_INCOMG1` == 7 ~ "$200,000 or more",
`_INCOMG1` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000","$15,000 to $24,999",
"$25,000 to $34,999","$35,000 to $49,999",
"$50,000 to $99,999","$100,000 to $199,999",
"$200,000 or more")
),
# Education: 6 levels (1-2 collapsed), 9 = NA
education = factor(
case_when(
EDUCA %in% c(1, 2) ~ "Less than high school",
EDUCA == 3 ~ "High school diploma or GED",
EDUCA == 4 ~ "Some college or technical school",
EDUCA == 5 ~ "College graduate",
EDUCA == 6 ~ "Graduate or professional degree",
EDUCA == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school",
"High school diploma or GED",
"Some college or technical school",
"College graduate",
"Graduate or professional degree")
),
# Smoking: 4 levels, 9 = NA
smoking = factor(
case_when(
`_SMOKER3` == 1 ~ "Current daily smoker",
`_SMOKER3` == 2 ~ "Current some-day smoker",
`_SMOKER3` == 3 ~ "Former smoker",
`_SMOKER3` == 4 ~ "Never smoker",
`_SMOKER3` == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Current daily smoker","Current some-day smoker",
"Former smoker","Never smoker")
)
) |>
select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking)
miss_summary <- brfss_clean |>
summarise(
across(everything(), ~ sum(is.na(.x))),
) |>
pivot_longer(everything(), names_to = "Variable", values_to = "N_Missing") |>
mutate(
N_Total = nrow(brfss_clean),
Pct_Missing = round(N_Missing / N_Total * 100, 2)
)
miss_summary |>
kable(caption = "Missing Values Before Exclusions") |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Variable | N_Missing | N_Total | Pct_Missing |
|---|---|---|---|
| menthlth_days | 8108 | 433323 | 1.87 |
| physhlth_days | 10785 | 433323 | 2.49 |
| bmi | 40535 | 433323 | 9.35 |
| sex | 0 | 433323 | 0.00 |
| exercise | 1251 | 433323 | 0.29 |
| age_group | 7779 | 433323 | 1.80 |
| income | 86623 | 433323 | 19.99 |
| education | 2325 | 433323 | 0.54 |
| smoking | 23062 | 433323 | 5.32 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
cat("Final analytic sample n =", nrow(brfss_analytic), "\n")
## Final analytic sample n = 8000
brfss_analytic |>
tbl_summary(
by = sex,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 2,
label = list(
menthlth_days ~ "Mentally unhealthy days",
physhlth_days ~ "Physically unhealthy days",
bmi ~ "BMI",
exercise ~ "Any exercise in past 30 days",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education",
smoking ~ "Smoking status"
)
) |>
add_overall() |>
add_p() |>
bold_labels() |>
modify_caption("**Table 1. Descriptive Statistics by Sex, BRFSS 2023 Analytic Sample (n = 8,000)**")
| Characteristic | Overall N = 8,0001 |
Male N = 3,9361 |
Female N = 4,0641 |
p-value2 |
|---|---|---|---|---|
| Mentally unhealthy days | 4.32 (8.13) | 3.53 (7.53) | 5.08 (8.60) | <0.001 |
| Physically unhealthy days | 4.43 (8.69) | 3.98 (8.41) | 4.86 (8.94) | <0.001 |
| BMI | 28.70 (6.50) | 28.71 (5.97) | 28.69 (6.98) | 0.008 |
| Any exercise in past 30 days | 6,240 (78%) | 3,146 (80%) | 3,094 (76%) | <0.001 |
| Age group | <0.001 | |||
| 18-24 | 406 (5.1%) | 235 (6.0%) | 171 (4.2%) | |
| 25-29 | 408 (5.1%) | 219 (5.6%) | 189 (4.7%) | |
| 30-34 | 463 (5.8%) | 253 (6.4%) | 210 (5.2%) | |
| 35-39 | 565 (7.1%) | 263 (6.7%) | 302 (7.4%) | |
| 40-44 | 582 (7.3%) | 290 (7.4%) | 292 (7.2%) | |
| 45-49 | 518 (6.5%) | 266 (6.8%) | 252 (6.2%) | |
| 50-54 | 608 (7.6%) | 305 (7.7%) | 303 (7.5%) | |
| 55-59 | 660 (8.3%) | 308 (7.8%) | 352 (8.7%) | |
| 60-64 | 787 (9.8%) | 408 (10%) | 379 (9.3%) | |
| 65-69 | 901 (11%) | 418 (11%) | 483 (12%) | |
| 70-74 | 808 (10%) | 382 (9.7%) | 426 (10%) | |
| 75-79 | 663 (8.3%) | 325 (8.3%) | 338 (8.3%) | |
| 80+ | 631 (7.9%) | 264 (6.7%) | 367 (9.0%) | |
| Annual household income | <0.001 | |||
| Less than $15,000 | 407 (5.1%) | 160 (4.1%) | 247 (6.1%) | |
| $15,000 to $24,999 | 641 (8.0%) | 271 (6.9%) | 370 (9.1%) | |
| $25,000 to $34,999 | 871 (11%) | 376 (9.6%) | 495 (12%) | |
| $35,000 to $49,999 | 1,067 (13%) | 482 (12%) | 585 (14%) | |
| $50,000 to $99,999 | 2,511 (31%) | 1,251 (32%) | 1,260 (31%) | |
| $100,000 to $199,999 | 1,865 (23%) | 996 (25%) | 869 (21%) | |
| $200,000 or more | 638 (8.0%) | 400 (10%) | 238 (5.9%) | |
| Education | 0.003 | |||
| Less than high school | 124 (1.6%) | 75 (1.9%) | 49 (1.2%) | |
| High school diploma or GED | 252 (3.2%) | 130 (3.3%) | 122 (3.0%) | |
| Some college or technical school | 1,827 (23%) | 950 (24%) | 877 (22%) | |
| College graduate | 2,138 (27%) | 1,018 (26%) | 1,120 (28%) | |
| Graduate or professional degree | 3,659 (46%) | 1,763 (45%) | 1,896 (47%) | |
| Smoking status | <0.001 | |||
| Current daily smoker | 658 (8.2%) | 339 (8.6%) | 319 (7.8%) | |
| Current some-day smoker | 268 (3.4%) | 151 (3.8%) | 117 (2.9%) | |
| Former smoker | 2,262 (28%) | 1,207 (31%) | 1,055 (26%) | |
| Never smoker | 4,812 (60%) | 2,239 (57%) | 2,573 (63%) | |
| 1 Mean (SD); n (%) | ||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | ||||
model_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
summary(model_full)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.99937 0.93968 9.577 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseNo 0.65116 0.21472 3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000 to $24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000 to $34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000 to $49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000 to $99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000 to $199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseNo **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to $24,999 ***
## income$25,000 to $34,999 ***
## income$35,000 to $49,999 ***
## income$50,000 to $99,999 ***
## income$100,000 to $199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
# Extract and round coefficients
coefs <- round(coef(model_full), 3)
coefs
## (Intercept)
## 8.999
## physhlth_days
## 0.266
## bmi
## 0.033
## sexFemale
## 1.390
## exerciseNo
## 0.651
## age_group25-29
## -1.057
## age_group30-34
## -1.094
## age_group35-39
## -1.811
## age_group40-44
## -2.893
## age_group45-49
## -3.056
## age_group50-54
## -3.517
## age_group55-59
## -4.496
## age_group60-64
## -4.522
## age_group65-69
## -5.578
## age_group70-74
## -6.025
## age_group75-79
## -6.287
## age_group80+
## -6.820
## income$15,000 to $24,999
## -1.637
## income$25,000 to $34,999
## -2.076
## income$35,000 to $49,999
## -2.547
## income$50,000 to $99,999
## -3.050
## income$100,000 to $199,999
## -3.500
## income$200,000 or more
## -3.784
## educationHigh school diploma or GED
## 0.080
## educationSome college or technical school
## 1.077
## educationCollege graduate
## 1.791
## educationGraduate or professional degree
## 1.738
## smokingCurrent some-day smoker
## -1.587
## smokingFormer smoker
## -1.990
## smokingNever smoker
## -2.937
glance(model_full) |>
select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual) |>
kable(digits = 4, caption = "Model-Level Statistics") |>
kable_styling(bootstrap_options = c("striped", "hover"))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | df.residual |
|---|---|---|---|---|---|---|
| 0.1853 | 0.1824 | 7.3516 | 62.5234 | 0 | 29 | 7970 |
**R-squared: 0.1853 The model’s predictors (physical health days, BMI, sex, exercise, age group, income, education, and smoking status) collectively explain 18.53% of the variance in mentally unhealthy days among U.S. adults in this sample. This indicates that while the predictors are statistically meaningful, the majority of variation in mental health outcomes is attributable to factors not captured in this model.
**Adjusted R-squared:0.1824. The Adjusted R-squared is 0.1824, slightly lower than the R-squared of 0.1853. Unlike R-squared, which increases automatically whenever a predictor is added regardless of whether it improves the model, Adjusted R-squared penalizes for the number of predictors included. The small difference (0.0029) between the two suggests that the predictors in this model are genuinely contributing explanatory power and that the model is not being inflated by unnecessary variables. This distinction is especially important here because several categorical variables (age group, income, education) each contribute multiple dummy variables.
**Root MSE (Residual Standard Error):7.352.The residual standard error is 7.352 days. This means that, on average, the model’s predicted number of mentally unhealthy days differs from the observed value by approximately 7.35 days. Given that the outcome ranges from 0 to 30 days, this represents a meaningful degree of prediction error, consistent with the relatively modest R-squared.
Overall F-test: State H₀, report F-statistic, df (numerator and denominator), p-value, and conclusion. H₀: All regression coefficients are simultaneously equal to zero — that is, none of the predictors are associated with mentally unhealthy days (β₁ = β₂ = … = β₂₉ = 0) Hₐ: At least one regression coefficient is not equal to zero F-statistic: 62.52 Degrees of freedom: 29 (numerator) and 7,970 (denominator) p-value: < 2.2 × 10⁻¹⁶ Conclusion: We reject H₀ at α = 0.05. There is overwhelming statistical evidence that at least one predictor in the model is associated with the number of mentally unhealthy days, after accounting for all other predictors. The model as a whole fits significantly better than an intercept-only (null) model.
type3_results <- car::Anova(model_full, type = "III")
type3_results
## Anova Table (Type III tests)
##
## Response: menthlth_days
## Sum Sq Df F value Pr(>F)
## (Intercept) 4957 1 91.7191 < 2.2e-16 ***
## physhlth_days 37623 1 696.1198 < 2.2e-16 ***
## bmi 345 1 6.3879 0.0115095 *
## sex 3724 1 68.9012 < 2.2e-16 ***
## exercise 497 1 9.1966 0.0024325 **
## age_group 30092 12 46.3986 < 2.2e-16 ***
## income 4734 6 14.5982 < 2.2e-16 ***
## education 1265 4 5.8521 0.0001064 ***
## smoking 5101 3 31.4613 < 2.2e-16 ***
## Residuals 430750 7970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Display as formatted table
type3_results |>
as.data.frame() |>
rownames_to_column("Predictor") |>
kable(digits = 4, caption = "Type III (Partial) Sums of Squares") |>
kable_styling(bootstrap_options = c("striped", "hover"))
| Predictor | Sum Sq | Df | F value | Pr(>F) |
|---|---|---|---|---|
| (Intercept) | 4957.0906 | 1 | 91.7191 | 0.0000 |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 |
| Residuals | 430750.0872 | 7970 | NA | NA |
# Reduced model excluding income
model_no_income <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
# Partial F-test comparing reduced vs. full
anova(model_no_income, model_full)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7976 435484
## 2 7970 430750 6 4733.9 14.598 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H₀: After accounting for all other predictors in the model, the income coefficients are jointly equal to zero — that is, income (as a group) is not associated with mentally unhealthy days (β $15k–$25k = β$25k–$35k = β$35k–$50k = β$50k–$100k = β$100k–$200k = β$200k+ = 0) Hₐ: At least one income coefficient is not equal to zero — that is, income collectively improves model fit after accounting for all other predictors F-statistic: 14.598 Degrees of freedom: 6 (numerator — the number of income dummy variables being tested) and 7,970 (denominator — residual df from the full model) p-value: < 2.2 × 10⁻¹⁶ Conclusion: We reject H₀ at α = 0.05. There is very strong statistical evidence that income, as a group of predictors, is significantly associated with mentally unhealthy days after accounting for physical health days, BMI, sex, exercise, age group, education, and smoking status. Removing income from the model results in a significantly worse fit, indicating that income collectively contributes meaningful explanatory power beyond what the other predictors provide.
# Reduced model excluding education
model_no_education <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
# Partial F-test comparing reduced vs. full
anova(model_no_education, model_full)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group +
## income + education + smoking
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7974 432015
## 2 7970 430750 4 1265.2 5.8521 0.0001064 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H₀: After accounting for all other predictors in the model (physical health days, BMI, sex, exercise, age group, income, and smoking), the education coefficients are all simultaneously equal to zero — that is, education as a group is not associated with mentally unhealthy days (β_HS = β_SomeCollege = β_CollegeGrad = β_GradDegree = 0) Hₐ: At least one education coefficient is not equal to zero — that is, education as a group adds statistically significant explanatory power after accounting for all other predictors F-statistic: 5.8521 Degrees of freedom: 4 (numerator) and 7,970 (denominator) p-value: 0.0001064 Conclusion: We reject H₀ at α = 0.05. There is strong statistical evidence that education, considered as a group of predictors, is significantly associated with mentally unhealthy days after accounting for all other variables in the model. Adding the four education dummy variables to the model produces a meaningful improvement in fit beyond what physical health, BMI, sex, exercise, age group, income, and smoking already explain.
Based on the Type III partial F-tests, physical health days, sex, exercise, and smoking status emerged as the strongest independent contributors to mentally unhealthy days, each showing highly statistically significant partial associations (p < 0.05) after accounting for all other predictors in the model. Age group also reached significance as a group, while the contributions of BMI were more modest. The chunk test for income — which jointly tested all six income dummy variables simultaneously revealed that income as a whole significantly improved model fit beyond the other predictors even if some individual income levels did not appear strongly significant in the individual t-test. this matters because a categorical variable with many levels can collectively explain meaningful variance even when no single level clears the significance threshold on its own. Similarly, the chunk test for education tested whether the four education dummy variables together added explanatory power, providing a single, more statistically powerful test of education’s overall contribution rather than fragmenting its effect across four separate comparisons. Together, the chunk tests complement the Type III results by answering a different and arguably more relevant question — not “is this specific level different from the reference?” but “does this entire variable belong in the model?” which is the appropriate inferential question when a categorical predictor is treated as a conceptual unit.
brfss_analytic <- brfss_analytic |>
mutate(
smoker_current = factor(
case_when(
smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
levels = c("Non-smoker", "Current smoker") # Non-smoker as reference
)
)
# Verify coding
table(brfss_analytic$smoker_current, useNA = "ifany")
##
## Non-smoker Current smoker
## 7074 926
# Model A: main effects only
model_A <- lm(
menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
# Model B: with sex * smoker_current interaction
model_B <- lm(
menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
summary(model_B)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.337 -3.837 -1.604 0.628 30.426
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.22653 0.90967 6.845 8.23e-12
## physhlth_days 0.26859 0.01007 26.679 < 2e-16
## bmi 0.03309 0.01323 2.502 0.012380
## sexFemale 1.18553 0.17752 6.678 2.58e-11
## smoker_currentCurrent smoker 1.52079 0.36539 4.162 3.19e-05
## exerciseNo 0.67285 0.21496 3.130 0.001753
## age_group25-29 -0.92018 0.51962 -1.771 0.076616
## age_group30-34 -0.89242 0.50591 -1.764 0.077774
## age_group35-39 -1.59287 0.48752 -3.267 0.001090
## age_group40-44 -2.62864 0.48564 -5.413 6.39e-08
## age_group45-49 -2.84255 0.49687 -5.721 1.10e-08
## age_group50-54 -3.27784 0.48195 -6.801 1.11e-11
## age_group55-59 -4.24987 0.47404 -8.965 < 2e-16
## age_group60-64 -4.26404 0.45671 -9.336 < 2e-16
## age_group65-69 -5.25062 0.44662 -11.756 < 2e-16
## age_group70-74 -5.71106 0.45439 -12.569 < 2e-16
## age_group75-79 -5.90758 0.47018 -12.565 < 2e-16
## age_group80+ -6.49946 0.47359 -13.724 < 2e-16
## income$15,000 to $24,999 -1.63574 0.46999 -3.480 0.000503
## income$25,000 to $34,999 -2.07457 0.44862 -4.624 3.82e-06
## income$35,000 to $49,999 -2.54551 0.43915 -5.796 7.03e-09
## income$50,000 to $99,999 -3.04298 0.41157 -7.394 1.57e-13
## income$100,000 to $199,999 -3.50972 0.42999 -8.162 3.79e-16
## income$200,000 or more -3.84047 0.50103 -7.665 2.00e-14
## educationHigh school diploma or GED 0.12556 0.81238 0.155 0.877177
## educationSome college or technical school 1.11789 0.69123 1.617 0.105865
## educationCollege graduate 1.81788 0.69283 2.624 0.008711
## educationGraduate or professional degree 1.66905 0.69428 2.404 0.016240
## sexFemale:smoker_currentCurrent smoker 1.28327 0.51705 2.482 0.013089
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## smoker_currentCurrent smoker ***
## exerciseNo **
## age_group25-29 .
## age_group30-34 .
## age_group35-39 **
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to $24,999 ***
## income$25,000 to $34,999 ***
## income$35,000 to $49,999 ***
## income$50,000 to $99,999 ***
## income$100,000 to $199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## sexFemale:smoker_currentCurrent smoker *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.363 on 7971 degrees of freedom
## Multiple R-squared: 0.1826, Adjusted R-squared: 0.1798
## F-statistic: 63.61 on 28 and 7971 DF, p-value: < 2.2e-16
anova(model_A, model_B)
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7972 432509
## 2 7971 432175 1 333.97 6.1598 0.01309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H₀: The association between current smoking status and mentally unhealthy days does not differ by sex — i.e., the interaction coefficient β(sex × smoker_current) = 0. Hₐ: The association between current smoking status and mentally unhealthy days does differ by sex — i.e., β(sex × smoker_current) ≠ 0.
F-statistic: 6.16 Degrees of freedom: 1 (numerator), 7971 (denominator) p-value: 0.0131 Conclusion: At α = 0.05, we reject H₀ (p = 0.013 < 0.05). There is statistically significant evidence that the association between current smoking and the number of mentally unhealthy days differs by sex. The interaction term improves model fit beyond the main effects alone, supporting the presence of effect modification by sex.
# Extract relevant coefficients
tidy(model_B) |>
filter(str_detect(term, "smoker_current|sex")) |>
kable(digits = 4, caption = "Interaction-Related Coefficients from Model B") |>
kable_styling(bootstrap_options = c("striped", "hover"))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| sexFemale | 1.1855 | 0.1775 | 6.6784 | 0.0000 |
| smoker_currentCurrent smoker | 1.5208 | 0.3654 | 4.1621 | 0.0000 |
| sexFemale:smoker_currentCurrent smoker | 1.2833 | 0.5171 | 2.4819 | 0.0131 |
b_smoker <- coef(model_B)["smoker_currentCurrent smoker"]
b_interact <- coef(model_B)["sexFemale:smoker_currentCurrent smoker"]
cat("Effect of current smoking among Men (reference):", round(b_smoker, 3), "\n")
## Effect of current smoking among Men (reference): 1.521
cat("Effect of current smoking among Women:", round(b_smoker + b_interact, 3), "\n")
## Effect of current smoking among Women: 2.804
Among men: The coefficient for smoker_currentCurrent smoker is 1.521. Among men, current smokers are associated with 1.521 more mentally unhealthy days in the past 30 days compared to non-smokers, holding all other variables constant.
Among women: The estimated effect among women is computed as: 1.5208 + 1.2833 = 2.8041 Among women, current smokers are associated with 2.804 more mentally unhealthy days in the past 30 days compared to non-smokers, holding all other variables constant.
The association between current smoking and mentally unhealthy days is stronger among women than among men. Specifically, the estimated difference between current smokers and non-smokers is approximately 2.804 days among women versus 1.521 days among men a difference of about 1.283 days, which is precisely the interaction coefficient. This suggests that sex modifies the association between smoking and mental health, with women who smoke reporting a notably larger number of mentally unhealthy days relative to non-smoking women than is seen among men
# Generate predicted values
pred_interaction <- ggpredict(model_B, terms = c("smoker_current", "sex"))
# Plot
plot(pred_interaction) +
labs(
title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
x = "Smoking Status",
y = "Predicted Mentally Unhealthy Days (Past 30)",
color = "Sex"
) +
theme_minimal(base_size = 13)
The findings indicate that physical health, smoking status, sex, and income are among the most influential predictors of mentally unhealthy days in this sample. Physically unhealthy days exhibited the strongest positive relationship with mental health burden, reinforcing the well-documented connection between physical and mental well-being. Being female and currently smoking were both independently linked to a higher number of mentally unhealthy days, whereas higher income was associated with fewer such days. In contrast, variables like BMI and certain education categories showed weaker or non-significant associations, suggesting that their effects on mental health may be more complex or operate through indirect pathways. A major limitation of cross-sectional data is that exposures and outcomes are measured simultaneously, preventing the establishment of temporal order and leaving open the possibility of reverse causation—for instance, poor mental health could contribute to smoking rather than result from it. Additionally, the BRFSS relies on self-reported information, which may introduce recall bias and social desirability bias. Two notable confounders that were not accounted for include (1) prior diagnosis of mental illness, which is likely associated with both smoking and increased mentally unhealthy days, and (2) social support or relationship status, which can influence mental health and may also be related to smoking behavior and sex.
Regular R-squared increases mechanically every time a predictor is added to a model, even if that predictor has no true relationship with the outcome. Adjusted R-squared penalizes for the number of predictors, only increasing when a new variable improves fit more than expected by chance. This distinction is especially important here because categorical variables like age group (12 dummy variables), income (6 dummies), and education (4 dummies) collectively add many degrees of freedom — R-squared would inflate substantially even if these variables contributed little real explanatory power. Chunk tests (partial F-tests) are valuable because categorical variables with many levels are represented by multiple dummy coefficients in the model. Evaluating each level’s individual t-test in summary() only tells you whether that specific category differs from the reference — it does not tell you whether the variable as a whole improves model fit. A chunk test jointly tests all levels simultaneously, answering the more meaningful question: does income (or education) as a construct belong in the model at all, after accounting for all other predictors?
I used AI in code debugging.I also used AI to understand the data set as the variables name is provided in the assignment instruction in capital letter where as the original data set have all the variables written in small letter. I asked AI why my data set was not loading. It suggested me to run names() to see all the variables that how I found out why the error was happening.