#Part 0: Data Acquisition and Preparation
#Load Required Packages
#Import Raw BRFSS Data
brfss_raw <- read_xpt("D:/fizza documents/Epi 553/Assignments/HW03/LLCP2023.XPT")
# Report dimensions
total_rows <- nrow(brfss_raw)
total_cols <- ncol(brfss_raw)
cat("Total rows in raw dataset:", total_rows, "\n")
## Total rows in raw dataset: 433323
cat("Total columns in raw dataset:", total_cols, "\n")
## Total columns in raw dataset: 350
#Select Required Variables
brfss_selected <- brfss_raw |>
select(
menthlth_days = MENTHLTH,
physhlth_days = PHYSHLTH,
bmi = `_BMI5`,
sex = SEXVAR, # SEXVAR: 1=Male, 2=Female
exercise = EXERANY2,
age_group = `_AGEG5YR`,
income = INCOME3,
education = EDUCA,
smoking = SMOKDAY2
)
glimpse(brfss_selected)
## Rows: 433,323
## Columns: 9
## $ menthlth_days <dbl> 88, 88, 2, 88, 88, 3, 77, 88, 88, 88, 88, 88, 88, 88, 88…
## $ physhlth_days <dbl> 88, 88, 6, 2, 88, 2, 8, 1, 5, 88, 88, 2, 88, 88, 88, 4, …
## $ bmi <dbl> 3047, 2856, 2231, 2744, 2585, 3018, 2441, 2727, 3347, 22…
## $ sex <dbl> 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2,…
## $ exercise <dbl> 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 7, 2, 2,…
## $ age_group <dbl> 13, 13, 13, 12, 12, 9, 13, 12, 13, 12, 8, 12, 10, 12, 12…
## $ income <dbl> 99, 99, 2, 99, 7, 7, 6, 99, 6, 7, 7, 99, 9, 9, 9, 3, 77,…
## $ education <dbl> 5, 5, 4, 5, 5, 5, 4, 5, 5, 4, 5, 4, 6, 5, 5, 4, 6, 6, 5,…
## $ smoking <dbl> NA, NA, 3, NA, NA, NA, 3, NA, NA, 3, NA, NA, 3, 3, 3, NA…
#Recode All Variables
brfss_clean <- brfss_selected |>
mutate(
# Clean outcome
menthlth_days = ifelse(menthlth_days %in% c(88, 99), NA, menthlth_days),
# Clean physical health days
physhlth_days = ifelse(physhlth_days %in% c(88, 99), NA, physhlth_days),
# Clean BMI (divide by 100, remove 9999)
bmi = ifelse(bmi == 9999, NA, bmi / 100),
# Convert to factors with labels
sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
exercise = factor(exercise, levels = c(1, 2), labels = c("Yes", "No")),
age_group = factor(age_group,
levels = 1:13,
labels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")),
income = factor(income,
levels = 1:6,
labels = c("Less than $15,000", "$15,000-$24,999", "$25,000-$34,999",
"$35,000-$49,999", "$50,000-$74,999", "$75,000 or more")),
education = factor(education,
levels = 1:6,
labels = c("Never attended", "Grades 1-8", "Grades 9-11",
"Grade 12 or GED", "College 1-3 years", "College 4+ years")),
smoking = factor(smoking,
levels = 1:4,
labels = c("Current daily", "Current some days", "Former", "Never"))
) |>
# Remove rows with missing income (codes 77, 99 become NA after factor conversion)
filter(!is.na(income))
cat("Rows after cleaning:", nrow(brfss_clean), "\n")
## Rows after cleaning: 136266
#Missing Data Report
data.frame(
Variable = c("menthlth_days", "bmi", "physhlth_days", "exercise"),
Missing_N = c(
sum(is.na(brfss_clean$menthlth_days)),
sum(is.na(brfss_clean$bmi)),
sum(is.na(brfss_clean$physhlth_days)),
sum(is.na(brfss_clean$exercise))
),
Missing_Pct = c(
mean(is.na(brfss_clean$menthlth_days)) * 100,
mean(is.na(brfss_clean$bmi)) * 100,
mean(is.na(brfss_clean$physhlth_days)) * 100,
mean(is.na(brfss_clean$exercise)) * 100
)
) |> kable(caption = "Missing Data") |> kable_styling()
| Variable | Missing_N | Missing_Pct |
|---|---|---|
| menthlth_days | 75484 | 55.3945959 |
| bmi | 8585 | 6.3001776 |
| physhlth_days | 70471 | 51.7157618 |
| exercise | 413 | 0.3030837 |
#Create Analytic Sample (n=8,000)
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
cat("Final analytic N =", nrow(brfss_analytic), "\n")
## Final analytic N = 8000
#Descriptive Statistics
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
tbl_summary(by = sex, statistic = all_continuous() ~ "{mean} ({sd})") |>
add_overall() |>
modify_caption("Table 1: Descriptive Statistics by Sex")
| Characteristic | Overall N = 8,0001 |
Male N = 3,2341 |
Female N = 4,7661 |
|---|---|---|---|
| menthlth_days | 18 (15) | 18 (16) | 18 (14) |
| physhlth_days | 18 (16) | 19 (16) | 18 (16) |
| bmi | 30 (8) | 29 (7) | 30 (9) |
| exercise | 4,370 (55%) | 1,851 (57%) | 2,519 (53%) |
| age_group | |||
| 18-24 | 231 (2.9%) | 127 (3.9%) | 104 (2.2%) |
| 25-29 | 252 (3.2%) | 131 (4.1%) | 121 (2.5%) |
| 30-34 | 423 (5.3%) | 184 (5.7%) | 239 (5.0%) |
| 35-39 | 522 (6.5%) | 207 (6.4%) | 315 (6.6%) |
| 40-44 | 569 (7.1%) | 230 (7.1%) | 339 (7.1%) |
| 45-49 | 562 (7.0%) | 199 (6.2%) | 363 (7.6%) |
| 50-54 | 737 (9.2%) | 265 (8.2%) | 472 (9.9%) |
| 55-59 | 924 (12%) | 367 (11%) | 557 (12%) |
| 60-64 | 1,122 (14%) | 440 (14%) | 682 (14%) |
| 65-69 | 998 (12%) | 410 (13%) | 588 (12%) |
| 70-74 | 748 (9.4%) | 308 (9.5%) | 440 (9.2%) |
| 75-79 | 485 (6.1%) | 178 (5.5%) | 307 (6.4%) |
| 80+ | 427 (5.3%) | 188 (5.8%) | 239 (5.0%) |
| income | |||
| Less than $15,000 | 781 (9.8%) | 319 (9.9%) | 462 (9.7%) |
| $15,000-$24,999 | 954 (12%) | 374 (12%) | 580 (12%) |
| $25,000-$34,999 | 956 (12%) | 379 (12%) | 577 (12%) |
| $35,000-$49,999 | 1,223 (15%) | 446 (14%) | 777 (16%) |
| $50,000-$74,999 | 2,021 (25%) | 823 (25%) | 1,198 (25%) |
| $75,000 or more | 2,065 (26%) | 893 (28%) | 1,172 (25%) |
| education | |||
| Never attended | 8 (0.1%) | 8 (0.2%) | 0 (0%) |
| Grades 1-8 | 211 (2.6%) | 137 (4.2%) | 74 (1.6%) |
| Grades 9-11 | 767 (9.6%) | 348 (11%) | 419 (8.8%) |
| Grade 12 or GED | 2,966 (37%) | 1,263 (39%) | 1,703 (36%) |
| College 1-3 years | 2,773 (35%) | 1,005 (31%) | 1,768 (37%) |
| College 4+ years | 1,275 (16%) | 473 (15%) | 802 (17%) |
| smoking | |||
| Current daily | 2,597 (32%) | 1,043 (32%) | 1,554 (33%) |
| Current some days | 1,038 (13%) | 430 (13%) | 608 (13%) |
| Former | 4,365 (55%) | 1,761 (54%) | 2,604 (55%) |
| Never | 0 (0%) | 0 (0%) | 0 (0%) |
| 1 Mean (SD); n (%) | |||
#Part 1: Multiple Linear Regression
model_full <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic)
summary(model_full)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.033 -9.430 -2.114 8.035 68.023
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.71175 5.12664 3.260 0.001120 **
## physhlth_days 0.31112 0.01029 30.243 < 2e-16 ***
## bmi -0.03723 0.01986 -1.875 0.060891 .
## sexFemale -0.34181 0.32546 -1.050 0.293640
## exerciseNo 1.75414 0.33220 5.280 1.32e-07 ***
## age_group25-29 0.74817 1.28744 0.581 0.561169
## age_group30-34 -0.42361 1.16038 -0.365 0.715078
## age_group35-39 -0.10379 1.12418 -0.092 0.926441
## age_group40-44 -1.50683 1.11181 -1.355 0.175362
## age_group45-49 -0.22875 1.12162 -0.204 0.838398
## age_group50-54 -2.50934 1.08438 -2.314 0.020688 *
## age_group55-59 -3.71878 1.05746 -3.517 0.000439 ***
## age_group60-64 -3.27140 1.03844 -3.150 0.001637 **
## age_group65-69 -3.21318 1.04762 -3.067 0.002169 **
## age_group70-74 -5.51356 1.08231 -5.094 3.58e-07 ***
## age_group75-79 -4.31956 1.14847 -3.761 0.000170 ***
## age_group80+ -3.71461 1.17848 -3.152 0.001627 **
## income$15,000-$24,999 0.80752 0.68585 1.177 0.239072
## income$25,000-$34,999 -0.53927 0.68761 -0.784 0.432911
## income$35,000-$49,999 -0.98112 0.65694 -1.493 0.135354
## income$50,000-$74,999 -1.47223 0.60862 -2.419 0.015588 *
## income$75,000 or more -1.82754 0.61662 -2.964 0.003048 **
## educationGrades 1-8 3.66358 5.07996 0.721 0.470819
## educationGrades 9-11 2.74554 5.01490 0.547 0.584067
## educationGrade 12 or GED 0.75134 4.99723 0.150 0.880491
## educationCollege 1-3 years 0.15661 4.99880 0.031 0.975008
## educationCollege 4+ years -0.52887 5.00886 -0.106 0.915912
## smokingCurrent some days -1.42597 0.52057 -2.739 0.006172 **
## smokingFormer -1.40730 0.36878 -3.816 0.000137 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.08 on 7971 degrees of freedom
## Multiple R-squared: 0.1404, Adjusted R-squared: 0.1373
## F-statistic: 46.48 on 28 and 7971 DF, p-value: < 2.2e-16
#Fitted Regression Equation
# Extract all coefficients rounded to 3 decimal places
coefs <- round(coef(model_full), 3)
# Display full equation
cat("**Fitted Regression Equation:**\n\n")
## **Fitted Regression Equation:**
cat("Predicted Mentally Unhealthy Days = ", coefs[1])
## Predicted Mentally Unhealthy Days = 16.712
# Loop through all coefficients (except intercept)
for(i in 2:length(coefs)) {
cat(" + ", coefs[i], "×", names(coefs)[i])
}
## + 0.311 × physhlth_days + -0.037 × bmi + -0.342 × sexFemale + 1.754 × exerciseNo + 0.748 × age_group25-29 + -0.424 × age_group30-34 + -0.104 × age_group35-39 + -1.507 × age_group40-44 + -0.229 × age_group45-49 + -2.509 × age_group50-54 + -3.719 × age_group55-59 + -3.271 × age_group60-64 + -3.213 × age_group65-69 + -5.514 × age_group70-74 + -4.32 × age_group75-79 + -3.715 × age_group80+ + 0.808 × income$15,000-$24,999 + -0.539 × income$25,000-$34,999 + -0.981 × income$35,000-$49,999 + -1.472 × income$50,000-$74,999 + -1.828 × income$75,000 or more + 3.664 × educationGrades 1-8 + 2.746 × educationGrades 9-11 + 0.751 × educationGrade 12 or GED + 0.157 × educationCollege 1-3 years + -0.529 × educationCollege 4+ years + -1.426 × smokingCurrent some days + -1.407 × smokingFormer
cat("\n")
#Interpret Selected Coefficients
cat("**Coefficient Interpretations (holding all other variables constant):**\n\n")
## **Coefficient Interpretations (holding all other variables constant):**
# Get coefficients
coef_physhlth <- coef(model_full)["physhlth_days"]
coef_bmi <- coef(model_full)["bmi"]
cat("**physhlth_days (continuous):**\n")
## **physhlth_days (continuous):**
cat("For each additional day of physically unhealthy days, the number of\n")
## For each additional day of physically unhealthy days, the number of
cat("mentally unhealthy days is estimated to change by", round(coef_physhlth, 3), "days.\n\n")
## mentally unhealthy days is estimated to change by 0.311 days.
cat("**bmi (continuous):**\n")
## **bmi (continuous):**
cat("For each one-unit increase in BMI, the number of mentally unhealthy days\n")
## For each one-unit increase in BMI, the number of mentally unhealthy days
cat("is estimated to", ifelse(coef_bmi > 0, "increase by", "decrease by"),
abs(round(coef_bmi, 3)), "days.\n\n")
## is estimated to decrease by 0.037 days.
# Sex coefficient
if("sexFemale" %in% names(coef(model_full))) {
coef_sex <- coef(model_full)["sexFemale"]
cat("**sex: Female vs. Male (reference):**\n")
cat("Females have", abs(round(coef_sex, 3)),
ifelse(coef_sex > 0, "more", "fewer"),
"mentally unhealthy days compared to males.\n\n")
}
## **sex: Female vs. Male (reference):**
## Females have 0.342 fewer mentally unhealthy days compared to males.
# Exercise coefficient
if("exerciseYes" %in% names(coef(model_full))) {
coef_exercise <- coef(model_full)["exerciseYes"]
cat("**exercise: Yes vs. No (reference):**\n")
cat("Individuals who exercise have", abs(round(coef_exercise, 3)),
ifelse(coef_exercise > 0, "more", "fewer"),
"mentally unhealthy days compared to those who do not exercise.\n\n")
}
# Age group example - pick a significant one
age_coefs <- names(coef(model_full))[grepl("age_group", names(coef(model_full)))]
if(length(age_coefs) > 0) {
cat("**Age group (", age_coefs[1], " vs. 18-24 reference):**\n")
cat("This age group has", abs(round(coef(model_full)[age_coefs[1]], 3)),
ifelse(coef(model_full)[age_coefs[1]] > 0, "more", "fewer"),
"mentally unhealthy days compared to adults aged 18-24.\n\n")
}
## **Age group ( age_group25-29 vs. 18-24 reference):**
## This age group has 0.748 more mentally unhealthy days compared to adults aged 18-24.
# Income coefficients
if("income$50,000-$74,999" %in% names(coef(model_full))) {
cat("**Income ($50,000-$74,999 vs. Less than $15,000 reference):**\n")
cat("Adults earning $50,000-$74,999 have",
abs(round(coef(model_full)["income$50,000-$74,999"], 3)),
ifelse(coef(model_full)["income$50,000-$74,999"] > 0, "more", "fewer"),
"mentally unhealthy days compared to those earning less than $15,000.\n\n")
}
## **Income ($50,000-$74,999 vs. Less than $15,000 reference):**
## Adults earning $50,000-$74,999 have 1.472 fewer mentally unhealthy days compared to those earning less than $15,000.
if("income$75,000 or more" %in% names(coef(model_full))) {
cat("**Income ($75,000 or more vs. Less than $15,000 reference):**\n")
cat("Adults earning $75,000 or more have",
abs(round(coef(model_full)["income$75,000 or more"], 3)),
ifelse(coef(model_full)["income$75,000 or more"] > 0, "more", "fewer"),
"mentally unhealthy days compared to those earning less than $15,000.\n\n")
}
## **Income ($75,000 or more vs. Less than $15,000 reference):**
## Adults earning $75,000 or more have 1.828 fewer mentally unhealthy days compared to those earning less than $15,000.
#Report and Interpret Model-Level Statistics
r2 <- summary(model_full)$r.squared
adj_r2 <- summary(model_full)$adj.r.squared
rse <- summary(model_full)$sigma
f_stat <- summary(model_full)$fstatistic
f_pvalue <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
cat("**Model Summary Statistics:**\n\n")
## **Model Summary Statistics:**
cat("**R-squared:**", round(r2, 4), "\n")
## **R-squared:** 0.1404
cat("-", round(r2 * 100, 1), "% of the variance in mentally unhealthy days is explained by all predictors.\n\n")
## - 14 % of the variance in mentally unhealthy days is explained by all predictors.
cat("**Adjusted R-squared:**", round(adj_r2, 4), "\n")
## **Adjusted R-squared:** 0.1373
cat("- This penalizes the R-squared for the number of predictors, providing a more conservative estimate.\n\n")
## - This penalizes the R-squared for the number of predictors, providing a more conservative estimate.
cat("**Root MSE (Residual Standard Error):**", round(rse, 3), "\n")
## **Root MSE (Residual Standard Error):** 14.082
cat("- On average, the model's predictions are off by about", round(rse, 2), "mentally unhealthy days.\n\n")
## - On average, the model's predictions are off by about 14.08 mentally unhealthy days.
cat("\n**Overall F-test:**\n")
##
## **Overall F-test:**
cat("H0: All regression coefficients (except intercept) equal 0\n")
## H0: All regression coefficients (except intercept) equal 0
cat("HA: At least one regression coefficient is not equal to 0\n")
## HA: At least one regression coefficient is not equal to 0
f_stat <- summary(model_full)$fstatistic
f_pvalue <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
cat("F-statistic:", round(f_stat[1], 2), "\n")
## F-statistic: 46.48
cat("Degrees of freedom:", f_stat[2], "(numerator),", f_stat[3], "(denominator)\n")
## Degrees of freedom: 28 (numerator), 7971 (denominator)
cat("p-value:", format.pval(f_pvalue, digits = 4), "\n")
## p-value: < 2.2e-16
cat("Conclusion: Reject H0. The model with all predictors significantly improves prediction.\n")
## Conclusion: Reject H0. The model with all predictors significantly improves prediction.
#Part 2: Tests of Hypothesis #Type III Sums of Squares
model_type3 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic,
contrasts = list(sex = contr.sum, exercise = contr.sum,
age_group = contr.sum, income = contr.sum,
education = contr.sum, smoking = contr.sum))
Anova(model_type3, type = "III")
## Anova Table (Type III tests)
##
## Response: menthlth_days
## Sum Sq Df F value Pr(>F)
## (Intercept) 38579 1 194.5567 < 2.2e-16 ***
## physhlth_days 181358 1 914.6133 < 2.2e-16 ***
## bmi 697 1 3.5139 0.0608910 .
## sex 219 1 1.1030 0.2936395
## exercise 5529 1 27.8830 1.323e-07 ***
## age_group 21701 12 9.1202 < 2.2e-16 ***
## income 5414 5 5.4612 5.066e-05 ***
## education 7152 5 7.2133 9.533e-07 ***
## smoking 3222 2 8.1253 0.0002984 ***
## Residuals 1580566 7971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Chunk Tests
# Chunk test for income
cat("\n--- Chunk Test: Income ---\n")
##
## --- Chunk Test: Income ---
cat("H0: All income coefficients are simultaneously equal to 0\n")
## H0: All income coefficients are simultaneously equal to 0
cat("HA: At least one income coefficient is not equal to 0\n\n")
## HA: At least one income coefficient is not equal to 0
model_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking, data = brfss_analytic)
anova_income <- anova(model_no_income, model_full)
cat("F-statistic:", round(anova_income$F[2], 2), "\n")
## F-statistic: 5.46
cat("Degrees of freedom:", anova_income$Df[2], "(numerator),",
anova_income$Res.Df[2], "(denominator)\n")
## Degrees of freedom: 5 (numerator), 7971 (denominator)
cat("p-value:", format.pval(anova_income$`Pr(>F)`[2], digits = 4), "\n")
## p-value: 5.066e-05
if(anova_income$`Pr(>F)`[2] < 0.05) {
cat("Conclusion: Reject H0. Income collectively improves the model significantly.\n")
} else {
cat("Conclusion: Fail to reject H0. Income does not significantly improve the model.\n")
}
## Conclusion: Reject H0. Income collectively improves the model significantly.
# Chunk test for education
cat("\n--- Chunk Test: Education ---\n")
##
## --- Chunk Test: Education ---
cat("H0: All education coefficients are simultaneously equal to 0\n")
## H0: All education coefficients are simultaneously equal to 0
cat("HA: At least one education coefficient is not equal to 0\n\n")
## HA: At least one education coefficient is not equal to 0
model_no_education <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking, data = brfss_analytic)
anova_education <- anova(model_no_education, model_full)
cat("F-statistic:", round(anova_education$F[2], 2), "\n")
## F-statistic: 7.21
cat("Degrees of freedom:", anova_education$Df[2], "(numerator),",
anova_education$Res.Df[2], "(denominator)\n")
## Degrees of freedom: 5 (numerator), 7971 (denominator)
cat("p-value:", format.pval(anova_education$`Pr(>F)`[2], digits = 4), "\n")
## p-value: 9.533e-07
if(anova_education$`Pr(>F)`[2] < 0.05) {
cat("Conclusion: Reject H0. Education collectively improves the model significantly.\n")
} else {
cat("Conclusion: Fail to reject H0. Education does not significantly improve the model.\n")
}
## Conclusion: Reject H0. Education collectively improves the model significantly.
#Written Synthesis
cat("**Synthesis of Type III and Chunk Test Findings**\n\n")
## **Synthesis of Type III and Chunk Test Findings**
cat("The Type III partial F-tests show which predictors have unique associations with\n")
## The Type III partial F-tests show which predictors have unique associations with
cat("mentally unhealthy days after accounting for all other variables. The chunk tests\n")
## mentally unhealthy days after accounting for all other variables. The chunk tests
cat("complement this by testing whether income and education as complete groups\n")
## complement this by testing whether income and education as complete groups
cat("contribute meaningfully to the model.\n\n")
## contribute meaningfully to the model.
cat("These chunk tests add value beyond individual t-tests because they assess the\n")
## These chunk tests add value beyond individual t-tests because they assess the
cat("joint contribution of all dummy variables representing a categorical predictor,\n")
## joint contribution of all dummy variables representing a categorical predictor,
cat("avoiding multiple comparisons problems and providing a single answer about\n")
## avoiding multiple comparisons problems and providing a single answer about
cat("whether the predictor as a whole is associated with the outcome.\n")
## whether the predictor as a whole is associated with the outcome.
#Part 3: Interaction Analysis
cat("\n**Test of Interaction between Sex and Smoking Status**\n\n")
##
## **Test of Interaction between Sex and Smoking Status**
cat("H0: The interaction term between sex and smoker_current equals 0\n")
## H0: The interaction term between sex and smoker_current equals 0
cat(" (The association between smoking and mental health does NOT differ by sex)\n")
## (The association between smoking and mental health does NOT differ by sex)
cat("HA: The interaction term is not equal to 0\n")
## HA: The interaction term is not equal to 0
cat(" (The association between smoking and mental health DOES differ by sex)\n\n")
## (The association between smoking and mental health DOES differ by sex)
# Create binary smoking variable
brfss_analytic <- brfss_analytic |>
mutate(smoker_current = factor(
case_when(
smoking %in% c("Current daily", "Current some days") ~ "Current smoker",
TRUE ~ "Non-smoker"
),
levels = c("Non-smoker", "Current smoker")
))
# Main effects model
model_A <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education, data = brfss_analytic)
# Interaction model
model_B <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education, data = brfss_analytic)
# Test interaction
cat("\n--- Test for Interaction ---\n")
##
## --- Test for Interaction ---
print(anova(model_A, model_B))
## Analysis of Variance Table
##
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
## exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
## exercise + age_group + income + education
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7972 1582054
## 2 7971 1582030 1 24.058 0.1212 0.7277
# Visualize
ggpredict(model_B, terms = c("smoker_current", "sex")) |>
plot() +
labs(title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
x = "Smoking Status", y = "Predicted Mentally Unhealthy Days") +
theme_minimal()
## Ignoring unknown labels:
## • linetype : "sex"
## • shape : "sex"
#Test Interaction Significance
anova_interaction <- anova(model_A, model_B)
cat("**Test of Interaction between Sex and Smoking Status**\n\n")
## **Test of Interaction between Sex and Smoking Status**
cat("H0: The interaction term between sex and smoker_current equals 0\n")
## H0: The interaction term between sex and smoker_current equals 0
cat(" (The association between smoking and mental health does NOT differ by sex)\n")
## (The association between smoking and mental health does NOT differ by sex)
cat("HA: The interaction term is not equal to 0\n")
## HA: The interaction term is not equal to 0
cat(" (The association DOES differ by sex)\n\n")
## (The association DOES differ by sex)
cat("F-statistic:", round(anova_interaction$F[2], 3), "\n")
## F-statistic: 0.121
cat("Degrees of freedom:", anova_interaction$Df[2], "(numerator),",
anova_interaction$Res.Df[2], "(denominator)\n")
## Degrees of freedom: 1 (numerator), 7971 (denominator)
cat("p-value:", format.pval(anova_interaction$`Pr(>F)`[2], digits = 4), "\n\n")
## p-value: 0.7277
if(anova_interaction$`Pr(>F)`[2] < 0.05) {
cat("Conclusion: Reject H0 at α = 0.05. There is statistically significant evidence\n")
cat("that the association between current smoking and mentally unhealthy days\n")
cat("differs between men and women.\n")
} else {
cat("Conclusion: Fail to reject H0 at α = 0.05. There is not sufficient evidence\n")
cat("that the association differs by sex.\n")
}
## Conclusion: Fail to reject H0 at α = 0.05. There is not sufficient evidence
## that the association differs by sex.
#Interpret Interaction Coefficients
# Get coefficients from Model B
coef_B <- coef(model_B)
# Find coefficient names
smoking_main <- names(coef_B)[grepl("smoker_current", names(coef_B))]
interaction_term <- names(coef_B)[grepl("sex.*:.*smoker", names(coef_B))]
cat("**Estimated Effect of Current Smoking on Mentally Unhealthy Days**\n\n")
## **Estimated Effect of Current Smoking on Mentally Unhealthy Days**
# Effect for men (reference category for sex is Male)
if(length(smoking_main) > 0) {
smoking_effect_men <- coef_B[smoking_main[1]]
cat("Among MEN:\n")
cat("Current smokers have", round(smoking_effect_men, 3),
ifelse(smoking_effect_men > 0, "more", "fewer"),
"mentally unhealthy days compared to non-smokers,\n")
cat("holding all other variables constant.\n\n")
}
## Among MEN:
## Current smokers have 1.13 more mentally unhealthy days compared to non-smokers,
## holding all other variables constant.
# Effect for women
if(length(interaction_term) > 0) {
smoking_effect_women <- coef_B[smoking_main[1]] + coef_B[interaction_term[1]]
cat("Among WOMEN:\n")
cat("Current smokers have", round(smoking_effect_women, 3),
ifelse(smoking_effect_women > 0, "more", "fewer"),
"mentally unhealthy days compared to non-smokers,\n")
cat("holding all other variables constant.\n\n")
cat("**Comparison:**\n")
cat("The difference in the smoking effect between sexes is",
round(abs(smoking_effect_men - smoking_effect_women), 3), "days.\n")
}
## Among WOMEN:
## Current smokers have 0.905 more mentally unhealthy days compared to non-smokers,
## holding all other variables constant.
##
## **Comparison:**
## The difference in the smoking effect between sexes is 0.225 days.
#Public Health Interpretation
cat("**Public Health Interpretation**\n\n")
## **Public Health Interpretation**
cat("This analysis examined whether the relationship between current smoking and\n")
## This analysis examined whether the relationship between current smoking and
cat("mental health differs between men and women using data from over 8,000 U.S. adults.\n")
## mental health differs between men and women using data from over 8,000 U.S. adults.
cat("After adjusting for physical health, BMI, exercise, age, income, education, and\n")
## After adjusting for physical health, BMI, exercise, age, income, education, and
cat("smoking status, we found no statistically significant evidence that the association\n")
## smoking status, we found no statistically significant evidence that the association
cat("between smoking and mentally unhealthy days varies by sex.\n\n")
## between smoking and mentally unhealthy days varies by sex.
cat("From a public health perspective, these findings suggest that smoking is associated\n")
## From a public health perspective, these findings suggest that smoking is associated
cat("with poorer mental health regardless of whether an individual is male or female.\n")
## with poorer mental health regardless of whether an individual is male or female.
cat("This has important implications for clinical practice and public health interventions.\n")
## This has important implications for clinical practice and public health interventions.
cat("First, mental health screening should be integrated into all smoking cessation\n")
## First, mental health screening should be integrated into all smoking cessation
cat("programs, as individuals who smoke may be at elevated risk for poor mental health\n")
## programs, as individuals who smoke may be at elevated risk for poor mental health
cat("regardless of their sex. Second, interventions addressing both smoking and mental\n")
## regardless of their sex. Second, interventions addressing both smoking and mental
cat("health can be designed as universal approaches rather than needing to be heavily\n")
## health can be designed as universal approaches rather than needing to be heavily
cat("sex-tailored, though other factors like age and socioeconomic status remain\n")
## sex-tailored, though other factors like age and socioeconomic status remain
cat("important considerations. Clinicians should be aware that patients who smoke may\n")
## important considerations. Clinicians should be aware that patients who smoke may
cat("benefit from concurrent mental health support, and public health campaigns should\n")
## benefit from concurrent mental health support, and public health campaigns should
cat("continue to highlight the mental health benefits of smoking cessation alongside the\n")
## continue to highlight the mental health benefits of smoking cessation alongside the
cat("well-established physical health benefits.\n")
## well-established physical health benefits.
#Part 4: Reflection
#A. Findings and Limitations
The results reveal several important determinants of mental health among U.S. adults. Physical health days showed the strongest association, with each additional physically unhealthy day corresponding to 0.31 more mentally unhealthy days (p < 0.001). Exercise was protective—individuals who did not exercise had 1.75 more mentally unhealthy days than those who exercised (p < 0.001). Higher income was also protective, with those earning $75,000 or more reporting 1.83 fewer mentally unhealthy days than those earning less than $15,000 (p < 0.001). Education and age group were also significantly associated with mental health. Notably, BMI (p = 0.061) and sex (p = 0.294) were not statistically significant in the full model, suggesting these factors may not independently predict mental health after accounting for other variables.
Several key limitations warrant discussion. First, the cross-sectional design prevents causal inference; we cannot determine whether poor mental health leads to smoking and physical inactivity or vice versa. Two specific unmeasured confounders that could bias these estimates are: (1) social support and isolation, which influence both mental health and health behaviors, and (2) chronic disease burden, which affects physical and mental health while correlating with smoking and exercise. Additionally, self-reported mental health days may be subject to recall bias, and the low R-squared (14%) indicates many important predictors remain unmeasured.
#B. From Simple to Multiple Regression
Adjusted R-squared penalizes the addition of predictors that do not meaningfully improve model fit. This distinction is especially important when adding multiple categorical predictors because each adds several dummy variables (e.g., age_group added 12 dummies), which would artificially inflate regular R-squared even without true associations. The adjusted R-squared (0.1373) was slightly lower than the regular R-squared (0.1404), reflecting this penalty.
Chunk tests are useful because individual t-tests for each dummy variable level would require multiple comparisons, increasing Type I error risk. Chunk tests provide a single answer about whether a categorical predictor as a whole contributes to the model. Both income (p = 5.07e-05) and education (p = 9.53e-07) were significant in chunk tests, confirming these socioeconomic factors collectively matter for mental health.
#C. AI Transparency I used ChatGPT to debug error messages related to variable selection and factor conversion in R. Specifically, I asked the following prompts: “Why am I getting an error that sex_raw object not found?” “How do I fix missing values causing zero analytic sample?” and “What is the correct variable name for age groups in BRFSS 2023?” I verified the AI’s suggestions by checking the BRFSS codebook documentation and testing the code step by step on a subset of the data before running the full analysis. All regression interpretations, the public health conclusion, and the reflection were written independently without AI assistance. The AI was used only for debugging syntax errors and understanding BRFSS variable naming conventions.
#Session Info
sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggeffects_2.3.2 gtsummary_2.5.0 car_3.1-5 carData_3.0-6
## [5] kableExtra_1.4.0 broom_1.0.12 haven_2.5.5 lubridate_1.9.3
## [9] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [13] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_4.0.2
## [17] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0 insight_1.4.6
## [5] tzdb_0.4.0 vctrs_0.6.5 tools_4.4.2 generics_0.1.4
## [9] datawizard_1.3.0 pkgconfig_2.0.3 RColorBrewer_1.1-3 S7_0.2.1
## [13] gt_1.3.0 lifecycle_1.0.5 compiler_4.4.2 farver_2.1.2
## [17] textshaping_0.4.0 litedown_0.9 htmltools_0.5.8.1 sass_0.4.10
## [21] yaml_2.3.10 Formula_1.2-5 pillar_1.11.1 jquerylib_0.1.4
## [25] cachem_1.1.0 abind_1.4-8 commonmark_2.0.0 tidyselect_1.2.1
## [29] digest_0.6.37 stringi_1.8.4 labeling_0.4.3 fastmap_1.2.0
## [33] grid_4.4.2 cli_3.6.3 magrittr_2.0.3 cards_0.7.1
## [37] withr_3.0.2 scales_1.4.0 backports_1.5.0 timechange_0.3.0
## [41] rmarkdown_2.30 otel_0.2.0 hms_1.1.4 evaluate_1.0.5
## [45] knitr_1.51 viridisLite_0.4.3 markdown_2.0 rlang_1.1.7
## [49] glue_1.8.0 xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0
## [53] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1 fs_1.6.6