#Part 0: Data Acquisition and Preparation

#Load Required Packages

#Import Raw BRFSS Data

brfss_raw <- read_xpt("D:/fizza documents/Epi 553/Assignments/HW03/LLCP2023.XPT")

# Report dimensions
total_rows <- nrow(brfss_raw)
total_cols <- ncol(brfss_raw)
cat("Total rows in raw dataset:", total_rows, "\n")
## Total rows in raw dataset: 433323
cat("Total columns in raw dataset:", total_cols, "\n")
## Total columns in raw dataset: 350

#Select Required Variables

brfss_selected <- brfss_raw |> 
  select(
    menthlth_days = MENTHLTH,
    physhlth_days = PHYSHLTH,
    bmi = `_BMI5`,
    sex = SEXVAR,            # SEXVAR: 1=Male, 2=Female
    exercise = EXERANY2,
    age_group = `_AGEG5YR`,
    income = INCOME3,
    education = EDUCA,
    smoking = SMOKDAY2
  )

glimpse(brfss_selected)
## Rows: 433,323
## Columns: 9
## $ menthlth_days <dbl> 88, 88, 2, 88, 88, 3, 77, 88, 88, 88, 88, 88, 88, 88, 88…
## $ physhlth_days <dbl> 88, 88, 6, 2, 88, 2, 8, 1, 5, 88, 88, 2, 88, 88, 88, 4, …
## $ bmi           <dbl> 3047, 2856, 2231, 2744, 2585, 3018, 2441, 2727, 3347, 22…
## $ sex           <dbl> 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 2,…
## $ exercise      <dbl> 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 7, 2, 2,…
## $ age_group     <dbl> 13, 13, 13, 12, 12, 9, 13, 12, 13, 12, 8, 12, 10, 12, 12…
## $ income        <dbl> 99, 99, 2, 99, 7, 7, 6, 99, 6, 7, 7, 99, 9, 9, 9, 3, 77,…
## $ education     <dbl> 5, 5, 4, 5, 5, 5, 4, 5, 5, 4, 5, 4, 6, 5, 5, 4, 6, 6, 5,…
## $ smoking       <dbl> NA, NA, 3, NA, NA, NA, 3, NA, NA, 3, NA, NA, 3, 3, 3, NA…

#Recode All Variables

brfss_clean <- brfss_selected |> 
  mutate(
    # Clean outcome
    menthlth_days = ifelse(menthlth_days %in% c(88, 99), NA, menthlth_days),
    
    # Clean physical health days
    physhlth_days = ifelse(physhlth_days %in% c(88, 99), NA, physhlth_days),
    
    # Clean BMI (divide by 100, remove 9999)
    bmi = ifelse(bmi == 9999, NA, bmi / 100),
    
    # Convert to factors with labels
    sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
    exercise = factor(exercise, levels = c(1, 2), labels = c("Yes", "No")),
    age_group = factor(age_group,
      levels = 1:13,
      labels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49", 
                 "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")),
    income = factor(income,
      levels = 1:6,
      labels = c("Less than $15,000", "$15,000-$24,999", "$25,000-$34,999",
                 "$35,000-$49,999", "$50,000-$74,999", "$75,000 or more")),
    education = factor(education,
      levels = 1:6,
      labels = c("Never attended", "Grades 1-8", "Grades 9-11",
                 "Grade 12 or GED", "College 1-3 years", "College 4+ years")),
    smoking = factor(smoking,
      levels = 1:4,
      labels = c("Current daily", "Current some days", "Former", "Never"))
  ) |> 
  # Remove rows with missing income (codes 77, 99 become NA after factor conversion)
  filter(!is.na(income))

cat("Rows after cleaning:", nrow(brfss_clean), "\n")
## Rows after cleaning: 136266

#Missing Data Report

data.frame(
  Variable = c("menthlth_days", "bmi", "physhlth_days", "exercise"),
  Missing_N = c(
    sum(is.na(brfss_clean$menthlth_days)),
    sum(is.na(brfss_clean$bmi)),
    sum(is.na(brfss_clean$physhlth_days)),
    sum(is.na(brfss_clean$exercise))
  ),
  Missing_Pct = c(
    mean(is.na(brfss_clean$menthlth_days)) * 100,
    mean(is.na(brfss_clean$bmi)) * 100,
    mean(is.na(brfss_clean$physhlth_days)) * 100,
    mean(is.na(brfss_clean$exercise)) * 100
  )
) |> kable(caption = "Missing Data") |> kable_styling()
Missing Data
Variable Missing_N Missing_Pct
menthlth_days 75484 55.3945959
bmi 8585 6.3001776
physhlth_days 70471 51.7157618
exercise 413 0.3030837

#Create Analytic Sample (n=8,000)

set.seed(1220)

brfss_analytic <- brfss_clean |> 
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise, 
          age_group, income, education, smoking) |> 
  slice_sample(n = 8000)

cat("Final analytic N =", nrow(brfss_analytic), "\n")
## Final analytic N = 8000

#Descriptive Statistics

brfss_analytic |> 
  select(menthlth_days, physhlth_days, bmi, sex, exercise, 
         age_group, income, education, smoking) |> 
  tbl_summary(by = sex, statistic = all_continuous() ~ "{mean} ({sd})") |> 
  add_overall() |> 
  modify_caption("Table 1: Descriptive Statistics by Sex")
Table 1: Descriptive Statistics by Sex
Characteristic Overall
N = 8,000
1
Male
N = 3,234
1
Female
N = 4,766
1
menthlth_days 18 (15) 18 (16) 18 (14)
physhlth_days 18 (16) 19 (16) 18 (16)
bmi 30 (8) 29 (7) 30 (9)
exercise 4,370 (55%) 1,851 (57%) 2,519 (53%)
age_group


    18-24 231 (2.9%) 127 (3.9%) 104 (2.2%)
    25-29 252 (3.2%) 131 (4.1%) 121 (2.5%)
    30-34 423 (5.3%) 184 (5.7%) 239 (5.0%)
    35-39 522 (6.5%) 207 (6.4%) 315 (6.6%)
    40-44 569 (7.1%) 230 (7.1%) 339 (7.1%)
    45-49 562 (7.0%) 199 (6.2%) 363 (7.6%)
    50-54 737 (9.2%) 265 (8.2%) 472 (9.9%)
    55-59 924 (12%) 367 (11%) 557 (12%)
    60-64 1,122 (14%) 440 (14%) 682 (14%)
    65-69 998 (12%) 410 (13%) 588 (12%)
    70-74 748 (9.4%) 308 (9.5%) 440 (9.2%)
    75-79 485 (6.1%) 178 (5.5%) 307 (6.4%)
    80+ 427 (5.3%) 188 (5.8%) 239 (5.0%)
income


    Less than $15,000 781 (9.8%) 319 (9.9%) 462 (9.7%)
    $15,000-$24,999 954 (12%) 374 (12%) 580 (12%)
    $25,000-$34,999 956 (12%) 379 (12%) 577 (12%)
    $35,000-$49,999 1,223 (15%) 446 (14%) 777 (16%)
    $50,000-$74,999 2,021 (25%) 823 (25%) 1,198 (25%)
    $75,000 or more 2,065 (26%) 893 (28%) 1,172 (25%)
education


    Never attended 8 (0.1%) 8 (0.2%) 0 (0%)
    Grades 1-8 211 (2.6%) 137 (4.2%) 74 (1.6%)
    Grades 9-11 767 (9.6%) 348 (11%) 419 (8.8%)
    Grade 12 or GED 2,966 (37%) 1,263 (39%) 1,703 (36%)
    College 1-3 years 2,773 (35%) 1,005 (31%) 1,768 (37%)
    College 4+ years 1,275 (16%) 473 (15%) 802 (17%)
smoking


    Current daily 2,597 (32%) 1,043 (32%) 1,554 (33%)
    Current some days 1,038 (13%) 430 (13%) 608 (13%)
    Former 4,365 (55%) 1,761 (54%) 2,604 (55%)
    Never 0 (0%) 0 (0%) 0 (0%)
1 Mean (SD); n (%)

#Part 1: Multiple Linear Regression

model_full <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + 
                   age_group + income + education + smoking,
                 data = brfss_analytic)

summary(model_full)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -37.033  -9.430  -2.114   8.035  68.023 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                16.71175    5.12664   3.260 0.001120 ** 
## physhlth_days               0.31112    0.01029  30.243  < 2e-16 ***
## bmi                        -0.03723    0.01986  -1.875 0.060891 .  
## sexFemale                  -0.34181    0.32546  -1.050 0.293640    
## exerciseNo                  1.75414    0.33220   5.280 1.32e-07 ***
## age_group25-29              0.74817    1.28744   0.581 0.561169    
## age_group30-34             -0.42361    1.16038  -0.365 0.715078    
## age_group35-39             -0.10379    1.12418  -0.092 0.926441    
## age_group40-44             -1.50683    1.11181  -1.355 0.175362    
## age_group45-49             -0.22875    1.12162  -0.204 0.838398    
## age_group50-54             -2.50934    1.08438  -2.314 0.020688 *  
## age_group55-59             -3.71878    1.05746  -3.517 0.000439 ***
## age_group60-64             -3.27140    1.03844  -3.150 0.001637 ** 
## age_group65-69             -3.21318    1.04762  -3.067 0.002169 ** 
## age_group70-74             -5.51356    1.08231  -5.094 3.58e-07 ***
## age_group75-79             -4.31956    1.14847  -3.761 0.000170 ***
## age_group80+               -3.71461    1.17848  -3.152 0.001627 ** 
## income$15,000-$24,999       0.80752    0.68585   1.177 0.239072    
## income$25,000-$34,999      -0.53927    0.68761  -0.784 0.432911    
## income$35,000-$49,999      -0.98112    0.65694  -1.493 0.135354    
## income$50,000-$74,999      -1.47223    0.60862  -2.419 0.015588 *  
## income$75,000 or more      -1.82754    0.61662  -2.964 0.003048 ** 
## educationGrades 1-8         3.66358    5.07996   0.721 0.470819    
## educationGrades 9-11        2.74554    5.01490   0.547 0.584067    
## educationGrade 12 or GED    0.75134    4.99723   0.150 0.880491    
## educationCollege 1-3 years  0.15661    4.99880   0.031 0.975008    
## educationCollege 4+ years  -0.52887    5.00886  -0.106 0.915912    
## smokingCurrent some days   -1.42597    0.52057  -2.739 0.006172 ** 
## smokingFormer              -1.40730    0.36878  -3.816 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.08 on 7971 degrees of freedom
## Multiple R-squared:  0.1404, Adjusted R-squared:  0.1373 
## F-statistic: 46.48 on 28 and 7971 DF,  p-value: < 2.2e-16

#Fitted Regression Equation

# Extract all coefficients rounded to 3 decimal places
coefs <- round(coef(model_full), 3)

# Display full equation
cat("**Fitted Regression Equation:**\n\n")
## **Fitted Regression Equation:**
cat("Predicted Mentally Unhealthy Days = ", coefs[1])
## Predicted Mentally Unhealthy Days =  16.712
# Loop through all coefficients (except intercept)
for(i in 2:length(coefs)) {
  cat(" + ", coefs[i], "×", names(coefs)[i])
}
##  +  0.311 × physhlth_days +  -0.037 × bmi +  -0.342 × sexFemale +  1.754 × exerciseNo +  0.748 × age_group25-29 +  -0.424 × age_group30-34 +  -0.104 × age_group35-39 +  -1.507 × age_group40-44 +  -0.229 × age_group45-49 +  -2.509 × age_group50-54 +  -3.719 × age_group55-59 +  -3.271 × age_group60-64 +  -3.213 × age_group65-69 +  -5.514 × age_group70-74 +  -4.32 × age_group75-79 +  -3.715 × age_group80+ +  0.808 × income$15,000-$24,999 +  -0.539 × income$25,000-$34,999 +  -0.981 × income$35,000-$49,999 +  -1.472 × income$50,000-$74,999 +  -1.828 × income$75,000 or more +  3.664 × educationGrades 1-8 +  2.746 × educationGrades 9-11 +  0.751 × educationGrade 12 or GED +  0.157 × educationCollege 1-3 years +  -0.529 × educationCollege 4+ years +  -1.426 × smokingCurrent some days +  -1.407 × smokingFormer
cat("\n")

#Interpret Selected Coefficients

cat("**Coefficient Interpretations (holding all other variables constant):**\n\n")
## **Coefficient Interpretations (holding all other variables constant):**
# Get coefficients
coef_physhlth <- coef(model_full)["physhlth_days"]
coef_bmi <- coef(model_full)["bmi"]

cat("**physhlth_days (continuous):**\n")
## **physhlth_days (continuous):**
cat("For each additional day of physically unhealthy days, the number of\n")
## For each additional day of physically unhealthy days, the number of
cat("mentally unhealthy days is estimated to change by", round(coef_physhlth, 3), "days.\n\n")
## mentally unhealthy days is estimated to change by 0.311 days.
cat("**bmi (continuous):**\n")
## **bmi (continuous):**
cat("For each one-unit increase in BMI, the number of mentally unhealthy days\n")
## For each one-unit increase in BMI, the number of mentally unhealthy days
cat("is estimated to", ifelse(coef_bmi > 0, "increase by", "decrease by"),
    abs(round(coef_bmi, 3)), "days.\n\n")
## is estimated to decrease by 0.037 days.
# Sex coefficient
if("sexFemale" %in% names(coef(model_full))) {
  coef_sex <- coef(model_full)["sexFemale"]
  cat("**sex: Female vs. Male (reference):**\n")
  cat("Females have", abs(round(coef_sex, 3)), 
      ifelse(coef_sex > 0, "more", "fewer"),
      "mentally unhealthy days compared to males.\n\n")
}
## **sex: Female vs. Male (reference):**
## Females have 0.342 fewer mentally unhealthy days compared to males.
# Exercise coefficient
if("exerciseYes" %in% names(coef(model_full))) {
  coef_exercise <- coef(model_full)["exerciseYes"]
  cat("**exercise: Yes vs. No (reference):**\n")
  cat("Individuals who exercise have", abs(round(coef_exercise, 3)),
      ifelse(coef_exercise > 0, "more", "fewer"),
      "mentally unhealthy days compared to those who do not exercise.\n\n")
}

# Age group example - pick a significant one
age_coefs <- names(coef(model_full))[grepl("age_group", names(coef(model_full)))]
if(length(age_coefs) > 0) {
  cat("**Age group (", age_coefs[1], " vs. 18-24 reference):**\n")
  cat("This age group has", abs(round(coef(model_full)[age_coefs[1]], 3)),
      ifelse(coef(model_full)[age_coefs[1]] > 0, "more", "fewer"),
      "mentally unhealthy days compared to adults aged 18-24.\n\n")
}
## **Age group ( age_group25-29  vs. 18-24 reference):**
## This age group has 0.748 more mentally unhealthy days compared to adults aged 18-24.
# Income coefficients
if("income$50,000-$74,999" %in% names(coef(model_full))) {
  cat("**Income ($50,000-$74,999 vs. Less than $15,000 reference):**\n")
  cat("Adults earning $50,000-$74,999 have", 
      abs(round(coef(model_full)["income$50,000-$74,999"], 3)),
      ifelse(coef(model_full)["income$50,000-$74,999"] > 0, "more", "fewer"),
      "mentally unhealthy days compared to those earning less than $15,000.\n\n")
}
## **Income ($50,000-$74,999 vs. Less than $15,000 reference):**
## Adults earning $50,000-$74,999 have 1.472 fewer mentally unhealthy days compared to those earning less than $15,000.
if("income$75,000 or more" %in% names(coef(model_full))) {
  cat("**Income ($75,000 or more vs. Less than $15,000 reference):**\n")
  cat("Adults earning $75,000 or more have", 
      abs(round(coef(model_full)["income$75,000 or more"], 3)),
      ifelse(coef(model_full)["income$75,000 or more"] > 0, "more", "fewer"),
      "mentally unhealthy days compared to those earning less than $15,000.\n\n")
}
## **Income ($75,000 or more vs. Less than $15,000 reference):**
## Adults earning $75,000 or more have 1.828 fewer mentally unhealthy days compared to those earning less than $15,000.

#Report and Interpret Model-Level Statistics

r2 <- summary(model_full)$r.squared
adj_r2 <- summary(model_full)$adj.r.squared
rse <- summary(model_full)$sigma
f_stat <- summary(model_full)$fstatistic
f_pvalue <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)

cat("**Model Summary Statistics:**\n\n")
## **Model Summary Statistics:**
cat("**R-squared:**", round(r2, 4), "\n")
## **R-squared:** 0.1404
cat("-", round(r2 * 100, 1), "% of the variance in mentally unhealthy days is explained by all predictors.\n\n")
## - 14 % of the variance in mentally unhealthy days is explained by all predictors.
cat("**Adjusted R-squared:**", round(adj_r2, 4), "\n")
## **Adjusted R-squared:** 0.1373
cat("- This penalizes the R-squared for the number of predictors, providing a more conservative estimate.\n\n")
## - This penalizes the R-squared for the number of predictors, providing a more conservative estimate.
cat("**Root MSE (Residual Standard Error):**", round(rse, 3), "\n")
## **Root MSE (Residual Standard Error):** 14.082
cat("- On average, the model's predictions are off by about", round(rse, 2), "mentally unhealthy days.\n\n")
## - On average, the model's predictions are off by about 14.08 mentally unhealthy days.
cat("\n**Overall F-test:**\n")
## 
## **Overall F-test:**
cat("H0: All regression coefficients (except intercept) equal 0\n")
## H0: All regression coefficients (except intercept) equal 0
cat("HA: At least one regression coefficient is not equal to 0\n")
## HA: At least one regression coefficient is not equal to 0
f_stat <- summary(model_full)$fstatistic
f_pvalue <- pf(f_stat[1], f_stat[2], f_stat[3], lower.tail = FALSE)
cat("F-statistic:", round(f_stat[1], 2), "\n")
## F-statistic: 46.48
cat("Degrees of freedom:", f_stat[2], "(numerator),", f_stat[3], "(denominator)\n")
## Degrees of freedom: 28 (numerator), 7971 (denominator)
cat("p-value:", format.pval(f_pvalue, digits = 4), "\n")
## p-value: < 2.2e-16
cat("Conclusion: Reject H0. The model with all predictors significantly improves prediction.\n")
## Conclusion: Reject H0. The model with all predictors significantly improves prediction.

#Part 2: Tests of Hypothesis #Type III Sums of Squares

model_type3 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + 
                    age_group + income + education + smoking,
                  data = brfss_analytic,
                  contrasts = list(sex = contr.sum, exercise = contr.sum,
                                   age_group = contr.sum, income = contr.sum,
                                   education = contr.sum, smoking = contr.sum))

Anova(model_type3, type = "III")
## Anova Table (Type III tests)
## 
## Response: menthlth_days
##                Sum Sq   Df  F value    Pr(>F)    
## (Intercept)     38579    1 194.5567 < 2.2e-16 ***
## physhlth_days  181358    1 914.6133 < 2.2e-16 ***
## bmi               697    1   3.5139 0.0608910 .  
## sex               219    1   1.1030 0.2936395    
## exercise         5529    1  27.8830 1.323e-07 ***
## age_group       21701   12   9.1202 < 2.2e-16 ***
## income           5414    5   5.4612 5.066e-05 ***
## education        7152    5   7.2133 9.533e-07 ***
## smoking          3222    2   8.1253 0.0002984 ***
## Residuals     1580566 7971                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Chunk Tests

# Chunk test for income
cat("\n--- Chunk Test: Income ---\n")
## 
## --- Chunk Test: Income ---
cat("H0: All income coefficients are simultaneously equal to 0\n")
## H0: All income coefficients are simultaneously equal to 0
cat("HA: At least one income coefficient is not equal to 0\n\n")
## HA: At least one income coefficient is not equal to 0
model_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + 
                        age_group + education + smoking, data = brfss_analytic)
anova_income <- anova(model_no_income, model_full)
cat("F-statistic:", round(anova_income$F[2], 2), "\n")
## F-statistic: 5.46
cat("Degrees of freedom:", anova_income$Df[2], "(numerator),", 
    anova_income$Res.Df[2], "(denominator)\n")
## Degrees of freedom: 5 (numerator), 7971 (denominator)
cat("p-value:", format.pval(anova_income$`Pr(>F)`[2], digits = 4), "\n")
## p-value: 5.066e-05
if(anova_income$`Pr(>F)`[2] < 0.05) {
  cat("Conclusion: Reject H0. Income collectively improves the model significantly.\n")
} else {
  cat("Conclusion: Fail to reject H0. Income does not significantly improve the model.\n")
}
## Conclusion: Reject H0. Income collectively improves the model significantly.
# Chunk test for education
cat("\n--- Chunk Test: Education ---\n")
## 
## --- Chunk Test: Education ---
cat("H0: All education coefficients are simultaneously equal to 0\n")
## H0: All education coefficients are simultaneously equal to 0
cat("HA: At least one education coefficient is not equal to 0\n\n")
## HA: At least one education coefficient is not equal to 0
model_no_education <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + 
                           age_group + income + smoking, data = brfss_analytic)
anova_education <- anova(model_no_education, model_full)
cat("F-statistic:", round(anova_education$F[2], 2), "\n")
## F-statistic: 7.21
cat("Degrees of freedom:", anova_education$Df[2], "(numerator),", 
    anova_education$Res.Df[2], "(denominator)\n")
## Degrees of freedom: 5 (numerator), 7971 (denominator)
cat("p-value:", format.pval(anova_education$`Pr(>F)`[2], digits = 4), "\n")
## p-value: 9.533e-07
if(anova_education$`Pr(>F)`[2] < 0.05) {
  cat("Conclusion: Reject H0. Education collectively improves the model significantly.\n")
} else {
  cat("Conclusion: Fail to reject H0. Education does not significantly improve the model.\n")
}
## Conclusion: Reject H0. Education collectively improves the model significantly.

#Written Synthesis

cat("**Synthesis of Type III and Chunk Test Findings**\n\n")
## **Synthesis of Type III and Chunk Test Findings**
cat("The Type III partial F-tests show which predictors have unique associations with\n")
## The Type III partial F-tests show which predictors have unique associations with
cat("mentally unhealthy days after accounting for all other variables. The chunk tests\n")
## mentally unhealthy days after accounting for all other variables. The chunk tests
cat("complement this by testing whether income and education as complete groups\n")
## complement this by testing whether income and education as complete groups
cat("contribute meaningfully to the model.\n\n")
## contribute meaningfully to the model.
cat("These chunk tests add value beyond individual t-tests because they assess the\n")
## These chunk tests add value beyond individual t-tests because they assess the
cat("joint contribution of all dummy variables representing a categorical predictor,\n")
## joint contribution of all dummy variables representing a categorical predictor,
cat("avoiding multiple comparisons problems and providing a single answer about\n")
## avoiding multiple comparisons problems and providing a single answer about
cat("whether the predictor as a whole is associated with the outcome.\n")
## whether the predictor as a whole is associated with the outcome.

#Part 3: Interaction Analysis

cat("\n**Test of Interaction between Sex and Smoking Status**\n\n")
## 
## **Test of Interaction between Sex and Smoking Status**
cat("H0: The interaction term between sex and smoker_current equals 0\n")
## H0: The interaction term between sex and smoker_current equals 0
cat("    (The association between smoking and mental health does NOT differ by sex)\n")
##     (The association between smoking and mental health does NOT differ by sex)
cat("HA: The interaction term is not equal to 0\n")
## HA: The interaction term is not equal to 0
cat("    (The association between smoking and mental health DOES differ by sex)\n\n")
##     (The association between smoking and mental health DOES differ by sex)
# Create binary smoking variable
brfss_analytic <- brfss_analytic |> 
  mutate(smoker_current = factor(
    case_when(
      smoking %in% c("Current daily", "Current some days") ~ "Current smoker",
      TRUE ~ "Non-smoker"
    ),
    levels = c("Non-smoker", "Current smoker")
  ))

# Main effects model
model_A <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
                exercise + age_group + income + education, data = brfss_analytic)

# Interaction model
model_B <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
                exercise + age_group + income + education, data = brfss_analytic)

# Test interaction
cat("\n--- Test for Interaction ---\n")
## 
## --- Test for Interaction ---
print(anova(model_A, model_B))
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df     RSS Df Sum of Sq      F Pr(>F)
## 1   7972 1582054                           
## 2   7971 1582030  1    24.058 0.1212 0.7277
# Visualize
ggpredict(model_B, terms = c("smoker_current", "sex")) |> 
  plot() +
  labs(title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
       x = "Smoking Status", y = "Predicted Mentally Unhealthy Days") +
  theme_minimal()
## Ignoring unknown labels:
## • linetype : "sex"
## • shape : "sex"

#Test Interaction Significance

anova_interaction <- anova(model_A, model_B)

cat("**Test of Interaction between Sex and Smoking Status**\n\n")
## **Test of Interaction between Sex and Smoking Status**
cat("H0: The interaction term between sex and smoker_current equals 0\n")
## H0: The interaction term between sex and smoker_current equals 0
cat("    (The association between smoking and mental health does NOT differ by sex)\n")
##     (The association between smoking and mental health does NOT differ by sex)
cat("HA: The interaction term is not equal to 0\n")
## HA: The interaction term is not equal to 0
cat("    (The association DOES differ by sex)\n\n")
##     (The association DOES differ by sex)
cat("F-statistic:", round(anova_interaction$F[2], 3), "\n")
## F-statistic: 0.121
cat("Degrees of freedom:", anova_interaction$Df[2], "(numerator),", 
    anova_interaction$Res.Df[2], "(denominator)\n")
## Degrees of freedom: 1 (numerator), 7971 (denominator)
cat("p-value:", format.pval(anova_interaction$`Pr(>F)`[2], digits = 4), "\n\n")
## p-value: 0.7277
if(anova_interaction$`Pr(>F)`[2] < 0.05) {
  cat("Conclusion: Reject H0 at α = 0.05. There is statistically significant evidence\n")
  cat("that the association between current smoking and mentally unhealthy days\n")
  cat("differs between men and women.\n")
} else {
  cat("Conclusion: Fail to reject H0 at α = 0.05. There is not sufficient evidence\n")
  cat("that the association differs by sex.\n")
}
## Conclusion: Fail to reject H0 at α = 0.05. There is not sufficient evidence
## that the association differs by sex.

#Interpret Interaction Coefficients

 # Get coefficients from Model B
coef_B <- coef(model_B)

# Find coefficient names
smoking_main <- names(coef_B)[grepl("smoker_current", names(coef_B))]
interaction_term <- names(coef_B)[grepl("sex.*:.*smoker", names(coef_B))]

cat("**Estimated Effect of Current Smoking on Mentally Unhealthy Days**\n\n")
## **Estimated Effect of Current Smoking on Mentally Unhealthy Days**
# Effect for men (reference category for sex is Male)
if(length(smoking_main) > 0) {
  smoking_effect_men <- coef_B[smoking_main[1]]
  cat("Among MEN:\n")
  cat("Current smokers have", round(smoking_effect_men, 3), 
      ifelse(smoking_effect_men > 0, "more", "fewer"), 
      "mentally unhealthy days compared to non-smokers,\n")
  cat("holding all other variables constant.\n\n")
}
## Among MEN:
## Current smokers have 1.13 more mentally unhealthy days compared to non-smokers,
## holding all other variables constant.
# Effect for women
if(length(interaction_term) > 0) {
  smoking_effect_women <- coef_B[smoking_main[1]] + coef_B[interaction_term[1]]
  cat("Among WOMEN:\n")
  cat("Current smokers have", round(smoking_effect_women, 3),
      ifelse(smoking_effect_women > 0, "more", "fewer"),
      "mentally unhealthy days compared to non-smokers,\n")
  cat("holding all other variables constant.\n\n")
  
  cat("**Comparison:**\n")
  cat("The difference in the smoking effect between sexes is", 
      round(abs(smoking_effect_men - smoking_effect_women), 3), "days.\n")
}
## Among WOMEN:
## Current smokers have 0.905 more mentally unhealthy days compared to non-smokers,
## holding all other variables constant.
## 
## **Comparison:**
## The difference in the smoking effect between sexes is 0.225 days.

#Public Health Interpretation

cat("**Public Health Interpretation**\n\n")
## **Public Health Interpretation**
cat("This analysis examined whether the relationship between current smoking and\n")
## This analysis examined whether the relationship between current smoking and
cat("mental health differs between men and women using data from over 8,000 U.S. adults.\n")
## mental health differs between men and women using data from over 8,000 U.S. adults.
cat("After adjusting for physical health, BMI, exercise, age, income, education, and\n")
## After adjusting for physical health, BMI, exercise, age, income, education, and
cat("smoking status, we found no statistically significant evidence that the association\n")
## smoking status, we found no statistically significant evidence that the association
cat("between smoking and mentally unhealthy days varies by sex.\n\n")
## between smoking and mentally unhealthy days varies by sex.
cat("From a public health perspective, these findings suggest that smoking is associated\n")
## From a public health perspective, these findings suggest that smoking is associated
cat("with poorer mental health regardless of whether an individual is male or female.\n")
## with poorer mental health regardless of whether an individual is male or female.
cat("This has important implications for clinical practice and public health interventions.\n")
## This has important implications for clinical practice and public health interventions.
cat("First, mental health screening should be integrated into all smoking cessation\n")
## First, mental health screening should be integrated into all smoking cessation
cat("programs, as individuals who smoke may be at elevated risk for poor mental health\n")
## programs, as individuals who smoke may be at elevated risk for poor mental health
cat("regardless of their sex. Second, interventions addressing both smoking and mental\n")
## regardless of their sex. Second, interventions addressing both smoking and mental
cat("health can be designed as universal approaches rather than needing to be heavily\n")
## health can be designed as universal approaches rather than needing to be heavily
cat("sex-tailored, though other factors like age and socioeconomic status remain\n")
## sex-tailored, though other factors like age and socioeconomic status remain
cat("important considerations. Clinicians should be aware that patients who smoke may\n")
## important considerations. Clinicians should be aware that patients who smoke may
cat("benefit from concurrent mental health support, and public health campaigns should\n")
## benefit from concurrent mental health support, and public health campaigns should
cat("continue to highlight the mental health benefits of smoking cessation alongside the\n")
## continue to highlight the mental health benefits of smoking cessation alongside the
cat("well-established physical health benefits.\n")
## well-established physical health benefits.

#Part 4: Reflection

#A. Findings and Limitations

The results reveal several important determinants of mental health among U.S. adults. Physical health days showed the strongest association, with each additional physically unhealthy day corresponding to 0.31 more mentally unhealthy days (p < 0.001). Exercise was protective—individuals who did not exercise had 1.75 more mentally unhealthy days than those who exercised (p < 0.001). Higher income was also protective, with those earning $75,000 or more reporting 1.83 fewer mentally unhealthy days than those earning less than $15,000 (p < 0.001). Education and age group were also significantly associated with mental health. Notably, BMI (p = 0.061) and sex (p = 0.294) were not statistically significant in the full model, suggesting these factors may not independently predict mental health after accounting for other variables.

Several key limitations warrant discussion. First, the cross-sectional design prevents causal inference; we cannot determine whether poor mental health leads to smoking and physical inactivity or vice versa. Two specific unmeasured confounders that could bias these estimates are: (1) social support and isolation, which influence both mental health and health behaviors, and (2) chronic disease burden, which affects physical and mental health while correlating with smoking and exercise. Additionally, self-reported mental health days may be subject to recall bias, and the low R-squared (14%) indicates many important predictors remain unmeasured.

#B. From Simple to Multiple Regression

Adjusted R-squared penalizes the addition of predictors that do not meaningfully improve model fit. This distinction is especially important when adding multiple categorical predictors because each adds several dummy variables (e.g., age_group added 12 dummies), which would artificially inflate regular R-squared even without true associations. The adjusted R-squared (0.1373) was slightly lower than the regular R-squared (0.1404), reflecting this penalty.

Chunk tests are useful because individual t-tests for each dummy variable level would require multiple comparisons, increasing Type I error risk. Chunk tests provide a single answer about whether a categorical predictor as a whole contributes to the model. Both income (p = 5.07e-05) and education (p = 9.53e-07) were significant in chunk tests, confirming these socioeconomic factors collectively matter for mental health.

#C. AI Transparency I used ChatGPT to debug error messages related to variable selection and factor conversion in R. Specifically, I asked the following prompts: “Why am I getting an error that sex_raw object not found?” “How do I fix missing values causing zero analytic sample?” and “What is the correct variable name for age groups in BRFSS 2023?” I verified the AI’s suggestions by checking the BRFSS codebook documentation and testing the code step by step on a subset of the data before running the full analysis. All regression interpretations, the public health conclusion, and the reflection were written independently without AI assistance. The AI was used only for debugging syntax errors and understanding BRFSS variable naming conventions.

#Session Info

sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeffects_2.3.2  gtsummary_2.5.0  car_3.1-5        carData_3.0-6   
##  [5] kableExtra_1.4.0 broom_1.0.12     haven_2.5.5      lubridate_1.9.3 
##  [9] forcats_1.0.0    stringr_1.5.1    dplyr_1.1.4      purrr_1.0.2     
## [13] readr_2.1.5      tidyr_1.3.1      tibble_3.2.1     ggplot2_4.0.2   
## [17] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.56          bslib_0.10.0       insight_1.4.6     
##  [5] tzdb_0.4.0         vctrs_0.6.5        tools_4.4.2        generics_0.1.4    
##  [9] datawizard_1.3.0   pkgconfig_2.0.3    RColorBrewer_1.1-3 S7_0.2.1          
## [13] gt_1.3.0           lifecycle_1.0.5    compiler_4.4.2     farver_2.1.2      
## [17] textshaping_0.4.0  litedown_0.9       htmltools_0.5.8.1  sass_0.4.10       
## [21] yaml_2.3.10        Formula_1.2-5      pillar_1.11.1      jquerylib_0.1.4   
## [25] cachem_1.1.0       abind_1.4-8        commonmark_2.0.0   tidyselect_1.2.1  
## [29] digest_0.6.37      stringi_1.8.4      labeling_0.4.3     fastmap_1.2.0     
## [33] grid_4.4.2         cli_3.6.3          magrittr_2.0.3     cards_0.7.1       
## [37] withr_3.0.2        scales_1.4.0       backports_1.5.0    timechange_0.3.0  
## [41] rmarkdown_2.30     otel_0.2.0         hms_1.1.4          evaluate_1.0.5    
## [45] knitr_1.51         viridisLite_0.4.3  markdown_2.0       rlang_1.1.7       
## [49] glue_1.8.0         xml2_1.5.2         svglite_2.2.2      rstudioapi_0.18.0 
## [53] jsonlite_2.0.0     R6_2.6.1           systemfonts_1.3.1  fs_1.6.6