knitr::opts_chunk$set(
  echo    = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width  = 10,
  fig.height = 6,
  cache  = FALSE
)

Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(GGally)
library(car)
library(ggeffects)
library(plotly)
library(lmtest)
library(dplyr)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

Preparing the Dataset

brfss_2023 <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2023.XPT"
) |>
  clean_names()

Part 0: Data Acquisition and Preparation

brfss_A3 <- brfss_2023 |>
  mutate(
    menthlth_days = case_when(
      menthlth == 88 ~ 0,
      menthlth %in% c(77, 99) ~ NA_real_,
      menthlth >= 1 & menthlth <= 30 ~ menthlth
    ),
    physhlth_days = case_when(
      physhlth == 88 ~ 0,
      physhlth %in% c(77, 99) ~ NA_real_,
      physhlth >= 1 & physhlth <= 30 ~ physhlth
    ),
    
    
    bmi = case_when(
      bmi5 == 9999 ~ NA_real_,
      TRUE ~ bmi5 / 100
    ),
    sex = factor(case_when(
      sexvar == 1 ~ "Male",
      sexvar == 2 ~ "Female",
      TRUE ~ NA_character_
    )),
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      exerany2 %in% c(7, 9) ~ NA_character_
    )),
   age_group = factor(case_when(
      ageg5yr == 1  ~ "18–24",
      ageg5yr == 2  ~ "25–29",
      ageg5yr == 3  ~ "30–34",
      ageg5yr == 4  ~ "35–39",
      ageg5yr == 5  ~ "40–44",
      ageg5yr == 6  ~ "45–49",
      ageg5yr == 7  ~ "50–54",
      ageg5yr == 8  ~ "55–59",
      ageg5yr == 9  ~ "60–64",
      ageg5yr == 10 ~ "65–69",
      ageg5yr == 11 ~ "70–74",
      ageg5yr == 12 ~ "75–79",
      ageg5yr == 13 ~ "80+",
      ageg5yr == 14 ~ NA_character_
    )),
    
    income = factor(case_when(
      incomg1 == 1 ~ "Less than $15,000",
      incomg1 == 2 ~ "$15,000–$24,999",
      incomg1 == 3 ~ "$25,000–$34,999",
      incomg1 == 4 ~ "$35,000–$49,999",
      incomg1 == 5 ~ "$50,000–$99,999",
      incomg1 == 6 ~ "$100,000–$199,999",
      incomg1 == 7 ~ "$200,000 or more",
      incomg1 == 9 ~ NA_character_
    )),
    education = factor(case_when(
      educa %in% c(1, 2) ~ "Less than high school",
      educa == 3 ~ "High school diploma or GED",
      educa == 4 ~ "Some college or technical school",
      educa == 5 ~ "College graduate",
      educa == 6 ~ "Graduate or professional degree",
      educa == 9 ~ NA_character_
    )),
    smoking = factor(case_when(
      smoker3 == 1 ~ "Current daily smoker",
      smoker3 == 2 ~ "Current some-day smoker",
      smoker3 == 3 ~ "Former smoker",
      smoker3 == 4 ~ "Never smoker",
      smoker3 == 9 ~ NA_character_
    ))
  )

Descriptive Statistics

brfss_A3 |>
  select(menthlth_days, physhlth_days, bmi, age_group, income, sex, education, smoking, exercise) |>
  tbl_summary(
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body mass index",
      age_group     ~ "Age group",
      income        ~ "Household income",
      sex           ~ "Sex",
      exercise      ~ "Any physical activity (past 30 days)", 
      education     ~ "Highest level of education completed",
      smoking       ~ "Smoking status (4 levels)"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023**")

**Table 1. Descriptive Statistics — BRFSS 2023**
Characteristic	N	N = 433,323¹
Mentally unhealthy days (past 30)	425,215	4.4 (8.3)
Physically unhealthy days (past 30)	422,538	4.5 (8.8)
Body mass index	392,788	28.5 (6.5)
Age group	425,544
18–24		26,280 (6.2%)
25–29		21,247 (5.0%)
30–34		24,803 (5.8%)
35–39		27,153 (6.4%)
40–44		28,463 (6.7%)
45–49		27,070 (6.4%)
50–54		31,291 (7.4%)
55–59		34,219 (8.0%)
60–64		41,974 (9.9%)
65–69		46,099 (11%)
70–74		43,533 (10%)
75–79		34,543 (8.1%)
80+		38,869 (9.1%)
Household income	346,700
$100,000–$199,999		76,637 (22%)
$15,000–$24,999		31,069 (9.0%)
$200,000 or more		26,770 (7.7%)
$25,000–$34,999		38,508 (11%)
$35,000–$49,999		47,502 (14%)
$50,000–$99,999		107,027 (31%)
Less than $15,000		19,187 (5.5%)
Sex	433,323
Female		229,541 (53%)
Male		203,782 (47%)
Highest level of education completed	430,998
College graduate		114,346 (27%)
Graduate or professional degree		184,867 (43%)
High school diploma or GED		16,161 (3.7%)
Less than high school		9,011 (2.1%)
Some college or technical school		106,613 (25%)
Smoking status (4 levels)	410,261
Current daily smoker		31,770 (7.7%)
Current some-day smoker		13,376 (3.3%)
Former smoker		113,134 (28%)
Never smoker		251,981 (61%)
Any physical activity (past 30 days)	432,072	325,227 (75%)
¹ Mean (SD); n (%)

#Report the number and percentage of missing observations for menthlth_days, bmi, and smoking

total_n <- nrow(brfss_A3)

# Missing summary
missing_summary <- brfss_A3 |>
  summarise(
    menthlth_miss_n = sum(is.na(menthlth_days)),
    menthlth_miss_pct = mean(is.na(menthlth_days)) * 100,
    
    bmi_miss_n = sum(is.na(bmi)),
    bmi_miss_pct = mean(is.na(bmi)) * 100,
    
    smoking_miss_n = sum(is.na(smoking)),
    smoking_miss_pct = mean(is.na(smoking)) * 100
  )
missing_summary

## # A tibble: 1 × 6
##   menthlth_miss_n menthlth_miss_pct bmi_miss_n bmi_miss_pct smoking_miss_n
##             <int>             <dbl>      <int>        <dbl>          <int>
## 1            8108              1.87      40535         9.35          23062
## # ℹ 1 more variable: smoking_miss_pct <dbl>

Analytic Sample

set.seed(1220)

brfss_analytic <- brfss_A3 |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
          age_group, income, education, smoking) |>
  slice_sample(n = 8000)

nrow(brfss_analytic)

## [1] 8000

brfss_analytic |>
  select(menthlth_days, physhlth_days, bmi, age_group,
         income, education, smoking, exercise, sex) |>
  tbl_summary(
    by = sex, 
    
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body mass index",
      age_group     ~ "Age group",
      income        ~ "Household income",
      exercise      ~ "Any physical activity (past 30 days)", 
      education     ~ "Highest level of education completed",
      smoking       ~ "Smoking status (4 levels)"
    ),
    
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  # add_overall() |>
  bold_labels() |>
  #italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics by Sex — BRFSS 2023 Analytic Sample (n = 8,000)**")

**Table 1. Descriptive Statistics by Sex — BRFSS 2023 Analytic Sample (n = 8,000)**
Characteristic	N	Female N = 4,064¹	Male N = 3,936¹
Mentally unhealthy days (past 30)	8,000	5.1 (8.6)	3.5 (7.5)
Physically unhealthy days (past 30)	8,000	4.9 (8.9)	4.0 (8.4)
Body mass index	8,000	28.7 (7.0)	28.7 (6.0)
Age group	8,000
18–24		171 (4.2%)	235 (6.0%)
25–29		189 (4.7%)	219 (5.6%)
30–34		210 (5.2%)	253 (6.4%)
35–39		302 (7.4%)	263 (6.7%)
40–44		292 (7.2%)	290 (7.4%)
45–49		252 (6.2%)	266 (6.8%)
50–54		303 (7.5%)	305 (7.7%)
55–59		352 (8.7%)	308 (7.8%)
60–64		379 (9.3%)	408 (10%)
65–69		483 (12%)	418 (11%)
70–74		426 (10%)	382 (9.7%)
75–79		338 (8.3%)	325 (8.3%)
80+		367 (9.0%)	264 (6.7%)
Household income	8,000
$100,000–$199,999		869 (21%)	996 (25%)
$15,000–$24,999		370 (9.1%)	271 (6.9%)
$200,000 or more		238 (5.9%)	400 (10%)
$25,000–$34,999		495 (12%)	376 (9.6%)
$35,000–$49,999		585 (14%)	482 (12%)
$50,000–$99,999		1,260 (31%)	1,251 (32%)
Less than $15,000		247 (6.1%)	160 (4.1%)
Highest level of education completed	8,000
College graduate		1,120 (28%)	1,018 (26%)
Graduate or professional degree		1,896 (47%)	1,763 (45%)
High school diploma or GED		122 (3.0%)	130 (3.3%)
Less than high school		49 (1.2%)	75 (1.9%)
Some college or technical school		877 (22%)	950 (24%)
Smoking status (4 levels)	8,000
Current daily smoker		319 (7.8%)	339 (8.6%)
Current some-day smoker		117 (2.9%)	151 (3.8%)
Former smoker		1,055 (26%)	1,207 (31%)
Never smoker		2,573 (63%)	2,239 (57%)
Any physical activity (past 30 days)	8,000	3,094 (76%)	3,146 (80%)
¹ Mean (SD); n (%)

#Part 1: Multiple Linear Regression

#Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?

#Step 1: Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output

m1 <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic)

summary(m1)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                9.33197    0.67780  13.768  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexMale                                   -1.39038    0.16750  -8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25–29                            -1.05660    0.51938  -2.034 0.041950
## age_group30–34                            -1.09395    0.50646  -2.160 0.030803
## age_group35–39                            -1.81103    0.48851  -3.707 0.000211
## age_group40–44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45–49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50–54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55–59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60–64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65–69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70–74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75–79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000–$24,999                      1.86281    0.36432   5.113 3.24e-07
## income$200,000 or more                    -0.28425    0.34023  -0.835 0.403477
## income$25,000–$34,999                      1.42347    0.32097   4.435 9.33e-06
## income$35,000–$49,999                      0.95299    0.29630   3.216 0.001304
## income$50,000–$99,999                      0.44936    0.23068   1.948 0.051456
## incomeLess than $15,000                    3.49984    0.42923   8.154 4.07e-16
## educationGraduate or professional degree  -0.05310    0.21116  -0.251 0.801448
## educationHigh school diploma or GED       -1.71101    0.49969  -3.424 0.000620
## educationLess than high school            -1.79091    0.69119  -2.591 0.009585
## educationSome college or technical school -0.71412    0.23816  -2.998 0.002722
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexMale                                   ***
## exerciseYes                               ** 
## age_group25–29                            *  
## age_group30–34                            *  
## age_group35–39                            ***
## age_group40–44                            ***
## age_group45–49                            ***
## age_group50–54                            ***
## age_group55–59                            ***
## age_group60–64                            ***
## age_group65–69                            ***
## age_group70–74                            ***
## age_group75–79                            ***
## age_group80+                              ***
## income$15,000–$24,999                     ***
## income$200,000 or more                       
## income$25,000–$34,999                     ***
## income$35,000–$49,999                     ** 
## income$50,000–$99,999                     .  
## incomeLess than $15,000                   ***
## educationGraduate or professional degree     
## educationHigh school diploma or GED       ***
## educationLess than high school            ** 
## educationSome college or technical school ** 
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

#Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.

menthlth_days = 9.332 + 0.266(physhlth_days) + 0.033(bmi) - 1.390(sexMale) - 0.651(exerciseYes) - 1.057(age_group25–29) - 1.094(age_group30–34)- 1.811(age_group35–39)- 2.893(age_group40–44) - 3.056(age_group45–49)- 3.517(age_group50–54)- 4.496(age_group55–59) - 4.522(age_group60–64)- 5.578(age_group65–69)- 6.025(age_group70–74) - 6.287(age_group75–79)- 6.820(age_group80+)+ 1.863(income $15,000–$24,999)- 0.284(income $200,000 or more)+ 1.423(income $25,000–$34,999)+ 0.953(income $35,000–$49,999)+ 0.449(income $50,000–$99,999)+ 3.500(income Less than $15,000)- 0.053(education Graduate/professional)- 1.711(education High school/GED)-1.791(education Less than high school)- 0.714(education Some college)- 1.587(smoking Current some-day smoker)- 1.990(smoking Former smoker)- 2.937(smoking Never smoker)

#Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant: • physhlth_days (continuous predictor) • bmi (continuous predictor) • sex: Female vs. Male (reference) • exercise: Yes vs. No (reference) • One age_group coefficient of your choice • Two income coefficients of your choice, compared to the reference category (Less than $15,000)

PHYSHLTH_DAYS: For each additonal physically unhealthy days in the past 30 days, the number of mentally unhealthy days positively increased by 0.266 days on average, holding all other variables constant. This means that poor physical health is associated with poor mental health.

BMI: For each 1-unit increase in BMI, mentally unhealthy days positively increases by 0.033 days on average, holding all other variables constant. This means that there is a small effect between higher BMI and mentally unhealthy days.

SEX (female vs. male(reference)) Females have 1.390 more mentally unhealthy days on average than males, holding all other variables constant. This means that being female is associated with more mentally unhealthy days.

EXERCISE (yes vs. no (refernce))

Indivduals who exercised in the past 30 days have 0.651 fewer mentally unhealthy days on average compared to those who do not exercise, hold all other variables constant.This means that exercise is associated with better mental health.

AGE ( 50-54 vs 18-24(reference)):

Individuals aged 50-54 have 3.517 fewer mentally unhealthy days on average compared to those aged 18-24, holding all other variables constant. This means that older individuals report better mental health than younger individuals.

INCOME (Less than $15,000 vs. $25,000–$34,999):

Individuals earning $25,000–$34,999 have about 2.08 fewer mentally unhealthy days on average compared to those earning less than $15,000, holding all other variables constant. This means that higher income individuals are less likely to experience mentally unhealthy days.

INCOME(Less than $15,000 vs. $200,000 or more):

Individuals earning $200,000 or more have about 3.78 fewer mentally unhealthy days on average compared to those earning less than $15,000, holding all other variables constant. This shows us that higher income level is associated with better mental health days. However, this relationship is not statistically significant, with a p-value greater than 0.05. There is not enough evidence to conclude that the relationship is reliable. #Step 4: Report and interpret each of the following model-level statistics: • R-squared: proportion of variance in mentally unhealthy days explained by all predictors • Adjusted R-squared: how it differs from R-squared and what it adds • Root MSE (residual standard error): average prediction error of the model • Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion

R^2: The model has an R^2 of 0.1853, meaning that approximately 18.5% of the variability in mentally unhealthy days is explained by the predictors in the model. The adjusted R^2 is 0.1824, which is slightly lower because it takes into account for the number of predictors and penalizes for model complexity.

Root MSE: The residual stand error is 7.35, showing that on average the models prediction differs from the observed number of mentally unhealthy days by about 7.35 days.

Null hypothesis (H₀): All regression coefficients are equal to zero.

Alternative hypothesis (H₁): At least one regression coefficient is not equal to zero.

DF: 7970 F-statistic: The overall F-test is 62.52. p-value: p < 0.001

Conclusion: We can reject the null hypothesis and conclude that at least one predictor is significantly associated with mentally unhealthy days.

glance(m1) |>
  select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value") |>
  mutate(Statistic = dplyr::recode(Statistic,
    "r.squared"     = "R²",
    "adj.r.squared" = "Adjusted R²",
    "sigma"         = "Residual Std. Error (Root MSE)",
    "statistic"     = "F-statistic",
    "p.value"       = "p-value (overall F-test)",
    "df"            = "Model df (p)",
    "df.residual"   = "Residual df (n − p − 1)",
    "nobs"          = "n (observations)"
  )) |>
  kable(caption = " Overall Model Summary") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Overall Model Summary
Statistic	Value
R²	0.1853
Adjusted R²	0.1824
Residual Std. Error (Root MSE)	7.3516
F-statistic	62.5234
p-value (overall F-test)	0.0000
Model df (p)	29.0000
Residual df (n − p − 1)	7970.0000
n (observations)	8000.0000

#Part 2: Tests of Hypotheses (20 Points) #Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output. Identify which predictors have statistically significant partial associations at α = 0.05.

Based on the Type III results, the predictors that are statistically significant with mentally unhealthy days is: physhlth_days, bmi, exercise, age_group, income, education, sex, and smoking. All predictors in the model are statistically significant with a p-value less than 0.05.

anova_type3 <- Anova(m1, type = "III")

anova_type3 |>
  as.data.frame() |>
  rownames_to_column("Source") |>
  mutate(across(where(is.numeric), ~ round(., 4))) |>
  kable(
    caption = "Type III (Partial) Sums of Squares — car::Anova()",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Type III (Partial) Sums of Squares — car::Anova()
Source	Sum of Sq	df	F value	p-value
(Intercept)	10245.0838	1	189.5608	0.0000
physhlth_days	37622.7952	1	696.1198	0.0000
bmi	345.2408	1	6.3879	0.0115
sex	3723.8662	1	68.9012	0.0000
exercise	497.0434	1	9.1966	0.0024
age_group	30092.1774	12	46.3986	0.0000
income	4733.8943	6	14.5982	0.0000
education	1265.1504	4	5.8521	0.0001
smoking	5101.1076	3	31.4613	0.0000
Residuals	430750.0872	7970	NA	NA

#Step 2 (Chunk test for income): Test whether income collectively improves the model significantly, after accounting for all other predictors: 13.Fit a reduced model that includes all predictors except income.14.Use anova(model_reduced, model_full) to compare the two models. 15.State H0 and HA for this test clearly. 16.Report the F-statistic, degrees of freedom, and p-value. 17.State your conclusion at α = 0.05.

F-statistic: 14.5982 p-value: <0.0001 DF=6 Conclusion: We can reject the null hypothesis because the p-value is less than 0.05. We can conclude that income collectively improves the fit of the model after adjusting for physhlth_days, sleep_hrs, sex, and exercise. Income is an important predictor of mentally unhealthy days in this model.

m_reduced <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_analytic
)
m_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

# Chunk test
anova(m_reduced, m_full) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced(excluding income)", "Full (+ demographics)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table Chunk Test: Do demographic variables collectively add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table Chunk Test: Do demographic variables collectively add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced(excluding income)	7976	435484.0	NA	NA	NA	NA
Full (+ demographics)	7970	430750.1	6	4733.894	14.5982	0

#Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.

F-statistic: 5.8521 p-value: 0.0001 DF=4 Conclusion: We can reject the null hypothesis because the p-value is less than 0.05. We can conclude that education collectively improves the fit of the model after adjusting for physhlth_days, sleep_hrs, sex, and exercise. Education is statistically significant predictor of mentally unhealthy days in this model.

m_reduced_edu <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_analytic
)

m_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)
# Chunk test
anova(m_reduced_edu, m_full) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced(no education)", "Full (+ demographics)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table Chunk Test: Do demographic variables collectively add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table Chunk Test: Do demographic variables collectively add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced(no education)	7974	432015.2	NA	NA	NA	NA
Full (+ demographics)	7970	430750.1	4	1265.15	5.8521	1e-04

#Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()?

The type III results show that several predictors have strong independent associations with mentally unhealthy days, however the strongest contributions come from physically unhealthy days (f = 696.12), age group (F=46.40), and smoking (F=31.46), indicating these factors play an important role in mental health. Smaller but still significant factors are BMI, exercise, income, education, and sex.The chunk test shows that income significantly improves model fit (f-test=14.60, p < 0.001) and education also adds power (f-test=5.85, p=0.0001), showing us that both factors contribute to mental health outcomes. The chunk tests allows us to evaluate whether the full model improves overall fit rather than the reduced model.

#Part 3: Interaction Analysis (25 Points) Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women. Step 1: Create a binary smoking variable called smoker_current: • “Current smoker” includes current daily smokers and current some-day smokers. • “Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category.

brfss_recoded <- brfss_analytic |>
  mutate(smoker_current = case_when(
    smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
    smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker",
    TRUE ~ NA_character_
  )) |>
  mutate(smoker_current = factor(smoker_current, levels = c("Non-smoker", "Current smoker")))

#Step 2: Fit two models: • Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. • Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.

brfss_recoded$sex <- relevel(brfss_recoded$sex, ref = "Male")

model_A <- lm(menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education,
              data = brfss_recoded)

tidy(model_A, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Interaction Model: Sex + smoker_current → Physical Health Days",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Interaction Model: Sex + smoker_current → Physical Health Days
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.1228	0.6105	8.3907	0.0000	3.9260	6.3197
physhlth_days	0.2686	0.0101	26.6731	0.0000	0.2489	0.2884
bmi	0.0334	0.0132	2.5250	0.0116	0.0075	0.0593
sexFemale	1.3331	0.1673	7.9675	0.0000	1.0051	1.6611
smoker_currentCurrent smoker	2.1287	0.2712	7.8489	0.0000	1.5971	2.6604
exerciseYes	-0.6725	0.2150	-3.1274	0.0018	-1.0940	-0.2510
age_group25–29	-0.9149	0.5198	-1.7602	0.0784	-1.9338	0.1040
age_group30–34	-0.8823	0.5061	-1.7434	0.0813	-1.8743	0.1097
age_group35–39	-1.5810	0.4877	-3.2421	0.0012	-2.5369	-0.6251
age_group40–44	-2.6157	0.4858	-5.3847	0.0000	-3.5680	-1.6635
age_group45–49	-2.8246	0.4970	-5.6836	0.0000	-3.7988	-1.8504
age_group50–54	-3.2600	0.4821	-6.7628	0.0000	-4.2050	-2.3151
age_group55–59	-4.2301	0.4741	-8.9219	0.0000	-5.1595	-3.3007
age_group60–64	-4.2484	0.4568	-9.3000	0.0000	-5.1439	-3.3529
age_group65–69	-5.2338	0.4467	-11.7163	0.0000	-6.1095	-4.3582
age_group70–74	-5.7023	0.4545	-12.5457	0.0000	-6.5933	-4.8113
age_group75–79	-5.8977	0.4703	-12.5401	0.0000	-6.8197	-4.9758
age_group80+	-6.4888	0.4737	-13.6975	0.0000	-7.4174	-5.5602
income$15,000–$24,999	1.8562	0.3650	5.0855	0.0000	1.1407	2.5717
income$200,000 or more	-0.3265	0.3408	-0.9581	0.3380	-0.9946	0.3415
income$25,000–$34,999	1.4337	0.3214	4.4605	0.0000	0.8036	2.0637
income$35,000–$49,999	0.9491	0.2969	3.1971	0.0014	0.3672	1.5310
income$50,000–$99,999	0.4537	0.2311	1.9632	0.0497	0.0007	0.9067
incomeLess than $15,000	3.5360	0.4300	8.2232	0.0000	2.6931	4.3789
educationGraduate or professional degree	-0.1579	0.2106	-0.7496	0.4535	-0.5706	0.2549
educationHigh school diploma or GED	-1.6896	0.5006	-3.3750	0.0007	-2.6710	-0.7083
educationLess than high school	-1.9035	0.6922	-2.7499	0.0060	-3.2604	-0.5466
educationSome college or technical school	-0.7070	0.2385	-2.9636	0.0030	-1.1746	-0.2393

model_B <- lm(menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education,
              data = brfss_recoded)

tidy(model_B, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Interaction Model: Sex * smoker_current → Physical Health Days",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Interaction Model: Sex * smoker_current → Physical Health Days
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	5.2075	0.6113	8.5189	0.0000	4.0092	6.4058
physhlth_days	0.2686	0.0101	26.6788	0.0000	0.2489	0.2883
bmi	0.0331	0.0132	2.5017	0.0124	0.0072	0.0590
sexFemale	1.1855	0.1775	6.6784	0.0000	0.8376	1.5335
smoker_currentCurrent smoker	1.5208	0.3654	4.1621	0.0000	0.8045	2.2371
exerciseYes	-0.6728	0.2150	-3.1301	0.0018	-1.0942	-0.2515
age_group25–29	-0.9202	0.5196	-1.7709	0.0766	-1.9388	0.0984
age_group30–34	-0.8924	0.5059	-1.7640	0.0778	-1.8841	0.0993
age_group35–39	-1.5929	0.4875	-3.2673	0.0011	-2.5485	-0.6372
age_group40–44	-2.6286	0.4856	-5.4127	0.0000	-3.5806	-1.6766
age_group45–49	-2.8425	0.4969	-5.7209	0.0000	-3.8165	-1.8686
age_group50–54	-3.2778	0.4820	-6.8012	0.0000	-4.2226	-2.3331
age_group55–59	-4.2499	0.4740	-8.9652	0.0000	-5.1791	-3.3206
age_group60–64	-4.2640	0.4567	-9.3364	0.0000	-5.1593	-3.3688
age_group65–69	-5.2506	0.4466	-11.7563	0.0000	-6.1261	-4.3751
age_group70–74	-5.7111	0.4544	-12.5686	0.0000	-6.6018	-4.8203
age_group75–79	-5.9076	0.4702	-12.5646	0.0000	-6.8292	-4.9859
age_group80+	-6.4995	0.4736	-13.7239	0.0000	-7.4278	-5.5711
income$15,000–$24,999	1.8740	0.3650	5.1348	0.0000	1.1586	2.5894
income$200,000 or more	-0.3307	0.3407	-0.9708	0.3317	-0.9986	0.3371
income$25,000–$34,999	1.4352	0.3213	4.4666	0.0000	0.8053	2.0650
income$35,000–$49,999	0.9642	0.2968	3.2485	0.0012	0.3824	1.5461
income$50,000–$99,999	0.4667	0.2311	2.0198	0.0434	0.0138	0.9197
incomeLess than $15,000	3.5097	0.4300	8.1623	0.0000	2.6668	4.3526
educationGraduate or professional degree	-0.1488	0.2105	-0.7069	0.4797	-0.5615	0.2639
educationHigh school diploma or GED	-1.6923	0.5005	-3.3815	0.0007	-2.6734	-0.7113
educationLess than high school	-1.8179	0.6928	-2.6239	0.0087	-3.1760	-0.4598
educationSome college or technical school	-0.7000	0.2385	-2.9351	0.0033	-1.1675	-0.2325
sexFemale:smoker_currentCurrent smoker	1.2833	0.5171	2.4819	0.0131	0.2697	2.2968

#Step 3: Test whether the interaction is statistically significant: 18.Use anova(model_A, model_B) to compare the two models. 19.State H0 and HA for this test. 20.Report the F-statistic, degrees of freedom, and p-value. 21.State your conclusion at α = 0.05.

H₀: There is no interaction between sex and smoking. Hₐ: There is an interaction between sex and smoking.

F-statistic: 6.16 Degrees of freedom: 1 p-value: 0.0131

Conclusion: The p-value is less than 0.05, we can reject the null hypothesis.This shows us that there is an association between smoking and mentally unhealthy days and differ by sex, showing a statistically significant interaction.

anova(model_A, model_B) |>
  as.data.frame() |>
  rownames_to_column("Model") |>
  mutate(
    Model = c("Reduced (no interaction)", "Full (with sex × smoking)"),
    across(where(is.numeric), ~ round(., 4))
  ) |>
  kable(
    caption = "Table 6. Partial F-Test: Does the sex × smoking interaction improve model fit?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 6. Partial F-Test: Does the sex × smoking interaction improve model fit?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (no interaction)	7972	432508.9	NA	NA	NA	NA
Full (with sex × smoking)	7971	432174.9	1	333.9749	6.1598	0.0131

#Step 4: Interpret the interaction using the coefficients from Model B: 22.Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language.

Among men, current smokers have 1.5208 more mentally unhealthy days on average compared to non-smokers, holding all other variables constant.

23.Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.

Among women, current smokers have 1.2833 more mentally unhealthy days on average compared to non-smokers, holding all other variables constant.

24.In 2 to 3 sentences, explain what these estimates mean together: is smoking more strongly associated with poor mental health in one sex than the other, and by how much?

Smoking is associated with more mentally unhealthy days in both men and women. However the effect of smoking is weaker among women than men by about 0.24 days. This suggests that while smoking is harmful for mental health in both groups, its impact is stronger in men.

#Step 5: Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.

brfss_recoded <- ggpredict(model_B, terms = c("smoker_current", "sex"))
plot(brfss_recoded) + labs(
  x= "Smoking Status", 
  y= "Mentally Unhealthy Days",
  title = "Interaction Between Smoking Status and Sex on Mental Health"
  
)

#Step 6: Write 3 to 5 sentences interpreting the interaction for a general public health audience. Do not include coefficient values or p-values. Focus on the substantive finding and its implications.

Smoking is associated with worse mental health with current smokers reporting more mentally unhealthy days than non-smokers. The relationship differs between men and women, showing that sex plays a role in how smoking impacts mental health. The results show the importance of looking at smoking and mental health by sex and by looking at both behaviors we could improve overall health.

#Part 4: Reflection (15 Points) Write a focused reflection of 250 to 400 words in continuous prose (not as a bulleted list) addressing all three topics below.

A. Findings and Limitations (6 points) What do the results suggest about the determinants of mental health among U.S. adults? Which predictors were most strongly associated? Which were not statistically significant? What are the key limitations of using cross-sectional survey data? Name at least two specific confounders that could bias the associations you estimated.

B. From Simple to Multiple Regression (4 points) What does Adjusted R-squared tell you that regular R-squared does not, and why is this distinction especially important when adding multiple categorical predictors? Why is it useful to test groups of predictors (income, education) collectively with chunk tests, rather than relying only on the individual t-test for each level?

C. AI Transparency (5 points) Describe which parts of the assignment (if any) you sought AI assistance for, what specific prompts you used, and how you verified the AI-assisted output was correct. If you did not use AI, state that explicitly.

The results suggest that mental health among adults is influenced by a combination of physical health, demographic, and behavioral factors. The largest predictor was physically unhealthy days, which showed a large association with mentally unhealthy days. Age also showed a strong pattern, with older individuals reporting fewer mental health days compared to younger adults. Smoking also showed an association with worse mental health and showed differences among men and women. BMI and exercise had a statistically significant effect but it was very small. All predictors were significant but some were stronger than others. There are important limitations to these findings. The data is cross-sectional which means we cannot establish causality. For example, poor mental health could lead to less physical activity rather than less physical activity leading to poor mental health. Additionally, unmeasured confounding may bias the results. These confounders could be healthcare and social support which may affect mental health. In addition, self-reported data may also arise measurement error. In regards to adjusted R^2, it provides a more accurate measure of model fit than R^2 because it accounts for the number of predictors in the model. Adjusted R^2 penalizes unnecessary complexity while R^2 does not, making adjusted R^2 essential when including predictors. This helps prevent over fitting and ensures that predictors being added are improving the model fit. Testing groups of predictors using chunk tests is useful because it evaluates whether a variable contributes to the model as a whole. Individual tests may fail to detect significance in the whole model if effects are spread across categories. Chunk tests can reveal the overall importance of a variable and helps shows it role in the model. For this assignment, I used AI to help debug by code. The code I used was from the lecture notes but when changing variables or model names, I encounter errors. I would input my code into AI to help determine where my error was. For example, in part 3 section 2 I had tidy(mod_int,conf.int = TRUE)realize I had not changed it to match the name of the model I was using which was model_B so I changed the code to (model_B, conf.int = TRUE). Since my code chunk came from the notes, I knew the code was mostly right, I just had an error with variable names,model names or adding commons/plus signs in certain areas, so I was able to check if the code was correct by running it again and getting the output with no errors.

Assignment 3

Grace Beal

2026-03-30

Setup and Data

Preparing the Dataset

Part 0: Data Acquisition and Preparation

Descriptive Statistics

Analytic Sample