Part 0: Data Acquisition and Preparation

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(gtsummary)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(ggeffects)
library(haven)

brfss_raw <- read_xpt("data/LLCP2023.XPT")

Total observations: 433,323 Total rows : 350

Select variables of interest and create analytic dataset

brfss_subset <- brfss_raw %>%
  select(MENTHLTH, PHYSHLTH, `_BMI5`, SEXVAR, EXERANY2, `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`)

# Recode variables
brfss_subset <- brfss_subset %>%
  mutate(
    #menthlth_days
    MENTHLTH      = if_else(MENTHLTH %in% c(77, 99), NA_real_, MENTHLTH),
    menthlth_days = case_when(
         MENTHLTH == 88 ~ 0,
         MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
         TRUE     ~  NA_real_),

    #physhlth_days
    PHYSHLTH      = if_else(PHYSHLTH %in% c(77, 99), NA_real_, PHYSHLTH),
    physhlth_days = case_when(
         PHYSHLTH == 88 ~ 0,
         PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
         TRUE     ~  NA_real_),
    
    #bmi
    bmi           = `_BMI5` / 100,
    bmi = if_else(bmi %in% c(9999), NA_real_, bmi),

    #sex
    sex           = factor(ifelse(SEXVAR == 1, "Male", "Female")),
    
    #exercise
    EXERANY2      = if_else(EXERANY2 %in% c(7, 9), NA_real_, EXERANY2),
    exercise      = factor(ifelse(EXERANY2 == 1, "Yes", "No")),

    #age_group
    `_AGEG5YR`    = if_else(`_AGEG5YR` %in% c(14), NA_real_, `_AGEG5YR`),
    age_group     = factor(case_when(
      `_AGEG5YR`  == 1 ~ "18-24",
      `_AGEG5YR`  == 2 ~ "25-29",
      `_AGEG5YR`  == 3 ~ "30-34",
      `_AGEG5YR`  == 4 ~ "35-39",
      `_AGEG5YR`  == 5 ~ "40-44",
      `_AGEG5YR`  == 6 ~ "45-49", 
      `_AGEG5YR`  == 7 ~ "50-54", 
      `_AGEG5YR`  == 8 ~ "55-59", 
      `_AGEG5YR`  == 9 ~ "60-64", 
      `_AGEG5YR`  == 10 ~ "65-69", 
      `_AGEG5YR`  == 11 ~ "70-74", 
      `_AGEG5YR`  == 12 ~ "75-79", 
      `_AGEG5YR`  == 13 ~ "80+"),
      levels      =  c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49", 
                       "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")),
    
    #income
    `_INCOMG1`    = if_else(`_INCOMG1` %in% c(9), NA_real_, `_INCOMG1`),
    income        = factor(case_when(
       `_INCOMG1` == 1 ~ "< $15,000",
       `_INCOMG1` == 2 ~ "$15,000 to < $25,000",
       `_INCOMG1` == 3 ~ "$25,000 to < $35,000",
       `_INCOMG1` == 4 ~ "$35,000 to < $50,000",
       `_INCOMG1` == 5 ~ "$50,000 to < $100,000", 
       `_INCOMG1` == 6 ~ "$100,000 to < $200,000", 
       `_INCOMG1` == 7 ~ "$200,000 or more"), 
       levels     =  c("< $15,000", "$15,000 to < $25,000", "$25,000 to < $35,000", "$35,000 to < $50,000", 
                       "$50,000 to < $100,000", "$100,000 to < $200,000",   "$200,000 or more")),
    
    #education
    EDUCA         = if_else(EDUCA %in% c(9), NA_real_, EDUCA),
    education     = factor(case_when(
           EDUCA  == 1 | EDUCA  == 2 ~ "< High school",
           EDUCA  == 3 ~ "High school graduate or GED",
           EDUCA  == 4 ~ "Some college or technical school",
           EDUCA  == 5 ~ "College graduate", 
           EDUCA  == 6 ~ "Graduate or professional degree"), 
           levels =  c("< High school", "High school graduate or GED", 
                       "Some college or technical school", "College graduate", "Graduate or professional degree")),
    #smoking
    `_SMOKER3`    = if_else(`_SMOKER3` %in% c(9), NA_real_, `_SMOKER3`),
    smoking       = factor(case_when(
      `_SMOKER3`  == 1 ~ "Current daily smoker",
      `_SMOKER3`  == 2 ~  "Current some-day smoker",
      `_SMOKER3`  == 3 ~ "Former smoker",
      `_SMOKER3`  == 4 ~ "Never smoker"),
      levels      =  c("Current daily smoker", "Current some-day smoker", 
                       "Former smoker", "Never smoker")))

Confirm all reformatting with table function, comparing data from raw dataset and analytical dataset (code repetitive and not shown)

Check for missing values

Report the number and percentage of missing observations for menthlth_days, physhlth_days, and income

mean(is.na(brfss_subset$menthlth_days)) * 100 # 1.87%
## [1] 1.871122
mean(is.na(brfss_subset$physhlth_days)) * 100 # 2.49%
## [1] 2.488906
mean(is.na(brfss_subset$income)) * 100 # 19.99%
## [1] 19.9904

Set seed, remove missing data, and slice to create analytic dataset

set.seed(1220)
brfss_analytic <- brfss_subset %>%
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) %>%
slice_sample(n = 8000)

Save the cleaned subset for future use

write_rds(brfss_analytic, "data/brfss_subset_2023.rds")

Create table of variables by sex

data_1 <- readRDS("data/brfss_subset_2023.rds")

data_1 %>%
  tbl_summary(
  by = sex,
  include = c(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking),
  label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body mass index",
      age_group     ~ "Age group in 5-year categories",
      income        ~ "Annual household income",
      exercise      ~ "Any physical activity (past 30 days)",
      education     ~ "Highest level of education completed",
      smoking       ~ "Smoking status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) %>%
  add_n() %>%
  # add_overall() %>%
  bold_labels() %>%
  #italicize_levels() %>%
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**") %>%
  as_flex_table()
**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**

Characteristic

N

Female
N = 4,0641

Male
N = 3,9361

Mentally unhealthy days (past 30)

8,000

5.1 (8.6)

3.5 (7.5)

Physically unhealthy days (past 30)

8,000

4.9 (8.9)

4.0 (8.4)

Body mass index

8,000

28.7 (7.0)

28.7 (6.0)

Any physical activity (past 30 days)

8,000

3,094 (76%)

3,146 (80%)

Age group in 5-year categories

8,000

18-24

171 (4.2%)

235 (6.0%)

25-29

189 (4.7%)

219 (5.6%)

30-34

210 (5.2%)

253 (6.4%)

35-39

302 (7.4%)

263 (6.7%)

40-44

292 (7.2%)

290 (7.4%)

45-49

252 (6.2%)

266 (6.8%)

50-54

303 (7.5%)

305 (7.7%)

55-59

352 (8.7%)

308 (7.8%)

60-64

379 (9.3%)

408 (10%)

65-69

483 (12%)

418 (11%)

70-74

426 (10%)

382 (9.7%)

75-79

338 (8.3%)

325 (8.3%)

80+

367 (9.0%)

264 (6.7%)

Annual household income

8,000

< $15,000

247 (6.1%)

160 (4.1%)

$15,000 to < $25,000

370 (9.1%)

271 (6.9%)

$25,000 to < $35,000

495 (12%)

376 (9.6%)

$35,000 to < $50,000

585 (14%)

482 (12%)

$50,000 to < $100,000

1,260 (31%)

1,251 (32%)

$100,000 to < $200,000

869 (21%)

996 (25%)

$200,000 or more

238 (5.9%)

400 (10%)

Highest level of education completed

8,000

< High school

49 (1.2%)

75 (1.9%)

High school graduate or GED

122 (3.0%)

130 (3.3%)

Some college or technical school

877 (22%)

950 (24%)

College graduate

1,120 (28%)

1,018 (26%)

Graduate or professional degree

1,896 (47%)

1,763 (45%)

Smoking status

8,000

Current daily smoker

319 (7.8%)

339 (8.6%)

Current some-day smoker

117 (2.9%)

151 (3.8%)

Former smoker

1,055 (26%)

1,207 (31%)

Never smoker

2,573 (63%)

2,239 (57%)

1Mean (SD); n (%)

Part 1: Multiple Linear Regression

Mulptiple linear regresson

mlr_model <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking,
              data = data_1)

tidy(mlr_model, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Maximum Model: All Candidate Predictors",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Maximum Model: All Candidate Predictors
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 11.0409 0.9593 11.5092 0.0000 9.1604 12.9214
physhlth_days 0.2656 0.0101 26.3841 0.0000 0.2459 0.2853
bmi 0.0334 0.0132 2.5274 0.0115 0.0075 0.0593
sexMale -1.3904 0.1675 -8.3007 0.0000 -1.7187 -1.0620
exerciseYes -0.6512 0.2147 -3.0326 0.0024 -1.0721 -0.2303
age_group25-29 -1.0566 0.5194 -2.0343 0.0420 -2.0747 -0.0385
age_group30-34 -1.0939 0.5065 -2.1600 0.0308 -2.0867 -0.1012
age_group35-39 -1.8110 0.4885 -3.7072 0.0002 -2.7686 -0.8534
age_group40-44 -2.8931 0.4875 -5.9346 0.0000 -3.8487 -1.9375
age_group45-49 -3.0562 0.4977 -6.1408 0.0000 -4.0318 -2.0806
age_group50-54 -3.5167 0.4832 -7.2775 0.0000 -4.4640 -2.5695
age_group55-59 -4.4960 0.4755 -9.4543 0.0000 -5.4282 -3.5638
age_group60-64 -4.5221 0.4585 -9.8633 0.0000 -5.4209 -3.6234
age_group65-69 -5.5779 0.4502 -12.3903 0.0000 -6.4604 -4.6955
age_group70-74 -6.0254 0.4574 -13.1728 0.0000 -6.9220 -5.1287
age_group75-79 -6.2866 0.4748 -13.2392 0.0000 -7.2174 -5.3557
age_group80+ -6.8197 0.4768 -14.3019 0.0000 -7.7544 -5.8850
income$15,000 to < $25,000 -1.6370 0.4690 -3.4905 0.0005 -2.5564 -0.7177
income$25,000 to < $35,000 -2.0764 0.4480 -4.6351 0.0000 -2.9545 -1.1982
income$35,000 to < $50,000 -2.5469 0.4382 -5.8122 0.0000 -3.4058 -1.6879
income$50,000 to < $100,000 -3.0505 0.4107 -7.4277 0.0000 -3.8555 -2.2454
income$100,000 to < $200,000 -3.4998 0.4292 -8.1537 0.0000 -4.3413 -2.6584
income$200,000 or more -3.7841 0.5004 -7.5628 0.0000 -4.7649 -2.8033
educationHigh school graduate or GED 0.0799 0.8107 0.0986 0.9215 -1.5092 1.6690
educationSome college or technical school 1.0768 0.6897 1.5612 0.1185 -0.2753 2.4288
educationCollege graduate 1.7909 0.6912 2.5911 0.0096 0.4360 3.1458
educationGraduate or professional degree 1.7378 0.6925 2.5095 0.0121 0.3803 3.0953
smokingCurrent some-day smoker -1.5867 0.5352 -2.9645 0.0030 -2.6359 -0.5375
smokingFormer smoker -1.9897 0.3371 -5.9020 0.0000 -2.6506 -1.3289
smokingNever smoker -2.9368 0.3216 -9.1313 0.0000 -3.5673 -2.3063

Fitted regression equation

\[\text{mental health days} = 11.041 + 0.266 \cdot \text{physical health days} + 0.033 \cdot \text{bmi} + -1.390 \cdot \text{sexMale} + -0.651 \cdot \text{exerciseYes} + -1.057 \cdot \text{age_group25-29} + -1.094 \cdot \text{age_group30-34} + -1.811 \cdot \text{age_group35-39} + -2.893 \cdot \text{age_group40-44} + -3.056 \cdot \text{age_group45-49} + -3.517 \cdot \text{age_group50-54} + -4.496 \cdot \text{age_group55-59} + -4.522 \cdot \text{age_group60-64} + -5.578 \cdot \text{age_group65-69} + -6.025 \cdot \text{age_group70-74} + -6.287 \cdot \text{age_group75-79} + -6.820 \cdot \text{age_group80+} + -1.637 \cdot \text{income15,000-<25,000} + -2.076 \cdot \text{income25,000-<35,000} + -2.547 \cdot \text{income35,000-<50,000} + -3.051 \cdot \text{income50,000-<100,000} + -3.500 \cdot \text{income100,000-<200,000} + -3.784 \cdot \text{income$200,000 or more} + 0.080 \cdot \text{educationHigh school graduate or GED} + 1.077 \cdot \text{educationSome college or technical school} + 1.791 \cdot \text{educationCollege graduate} + 1.738 \cdot \text{educationGraduate or professional degree} + -1.587 \cdot \text{smokingCurrent some-day smoker} + -1.990 \cdot \text{smokingFormer smoker} + -2.937 \cdot \text{smokingNever smoker} + \varepsilon\] #### Interpretation of regression equation • physhlth_days (continuous predictor) - Every additional day with poor physical health was associated with 0.266 more days of poor mental health, adjusting for other variables.

• bmi (continuous predictor) - Every additional one unit increase in BMI was associated with 0.033 more days of poor mental health, adjusting for other variables.

• sex: Female vs. Male (reference) - Males had 1.390 fewer days of poor mental health compared to females, adjusting for other variables.

• exercise: Yes vs. No (reference) - Those exercise had 0.651 fewer days of poor mental health compared to those who did not exercise, adjusting for other variables.

• One age_group coefficient of your choice - Those aged 80 or more had 6.820 fewer days of poor mental health compared to those aged 18 to 24, adjusting for other variables.

• Two income coefficients of your choice, compared to the reference category (Less than $15,000) - Those with an income of 50,000 to 100,000 had 3.051 fewer days of poor mental health compared to those with an income less than 15,000, adjusting for other variables. - Those with an income of 200,000 or more had 3.784 fewer days of poor mental health compared to those with an income less than 15,000, adjusting for other variables.

Model level statistics

summary(mlr_model)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = data_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               11.04090    0.95931  11.509  < 2e-16
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexMale                                   -1.39038    0.16750  -8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to < $25,000                -1.63703    0.46899  -3.491 0.000485
## income$25,000 to < $35,000                -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to < $50,000                -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to < $100,000               -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to < $200,000              -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHigh school graduate or GED       0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingCurrent some-day smoker            -1.58670    0.53523  -2.965 0.003041
## smokingFormer smoker                      -1.98971    0.33713  -5.902 3.74e-09
## smokingNever smoker                       -2.93681    0.32162  -9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexMale                                   ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to < $25,000                ***
## income$25,000 to < $35,000                ***
## income$35,000 to < $50,000                ***
## income$50,000 to < $100,000               ***
## income$100,000 to < $200,000              ***
## income$200,000 or more                    ***
## educationHigh school graduate or GED         
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingCurrent some-day smoker            ** 
## smokingFormer smoker                      ***
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

• R-squared: proportion of variance in mentally unhealthy days explained by all predictors The R-squared is 0.185. This value describes how well the multiple linear regression model fits.

• Adjusted R-squared: how it differs from R-squared and what it adds The adjusted R-squared is 0.182. This value describes how well the multiple linear regression model fits, adjusting for the number of covariates.

• Root MSE (residual standard error): average prediction error of the model The residual standard error is 7.352. This value describes how close the observed data points are to the regression model.

• Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion The null hypothesis is that the number of days with poor physical health is not associated with the number of days with poor mental health, adjusting for all other variables. The F-statistic is 62.52 with 29 degrees of freedom and a p-value <2.2e-16. We can reject the null hypothesis and claim that the number of days with poor physical health is associated with the number of days with poor mental health, adjusting for all other variables.

Part 2 Tests of Hypotheses

Type III

Anova(mlr_model, type = "III") %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  filter(Source != "(Intercept)") %>%
  mutate(
    Significant = ifelse(`Pr(>F)` < 0.05, "Yes", "No"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 8. Type III Partial F-Tests for Each Predictor",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value", "Significant (α = 0.05)")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  column_spec(6, bold = TRUE)
Table 8. Type III Partial F-Tests for Each Predictor
Source Sum of Sq df F value p-value Significant (α = 0.05)
physhlth_days 37622.7952 1 696.1198 0.0000 Yes
bmi 345.2408 1 6.3879 0.0115 Yes
sex 3723.8662 1 68.9012 0.0000 Yes
exercise 497.0434 1 9.1966 0.0024 Yes
age_group 30092.1774 12 46.3986 0.0000 Yes
income 4733.8943 6 14.5982 0.0000 Yes
education 1265.1504 4 5.8521 0.0001 Yes
smoking 5101.1076 3 31.4613 0.0000 Yes
Residuals 430750.0872 7970 NA NA NA

All the predictors have statistically significant partial associations at α = 0.05.

Chunk test for income

mlr_model_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking,
              data = data_1)

anova(mlr_model_no_income, mlr_model) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (without income)", "Full"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Chunk Test: Does income add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Chunk Test: Does income add to the model?
Model Res. df RSS df Sum of Sq F p-value
Reduced (without income) 7976 435484.0 NA NA NA NA
Full 7970 430750.1 6 4733.894 14.5982 0

The null hypothesis is that income does not significantly add to the model, whereas the alternative hypothesis is that income does significantly add to the model. The F-statistic is 14.598 with 6 degrees of freedom and a p-value <0.05. We can reject the null hypothesis and claim that income does significantly add to the model.

Chunk test for education

mlr_model_no_education <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking,
              data = data_1)

anova(mlr_model_no_education, mlr_model) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (without education)", "Full"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Chunk Test: Does education add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Chunk Test: Does education add to the model?
Model Res. df RSS df Sum of Sq F p-value
Reduced (without education) 7974 432015.2 NA NA NA NA
Full 7970 430750.1 4 1265.15 5.8521 1e-04

The null hypothesis is that education does not significantly add to the model, whereas the alternative hypothesis is that education does significantly add to the model. The F-statistic is 5.852 with 4 degrees of freedom and a p-value <0.05. We can reject the null hypothesis and claim that education does significantly add to the model.

#Chunk test synthesis The type III results showed that all variables in the model have significant impact on mental health days. The chunk tests aimed to determine if the model would be stronger with or without income and education, which showed that the full model was significantly more predictive for both variables. Based on the chunk tests, income has a larger independent contribution compared to education based on the larger F-statistic and both have significant p-values. Physical health days had the highest F-statistic meaning this variable has the strongest independent contribution to the model. The chunk tests show the individual impact of the variables of interest to see if the model would be impacted by their removal.

Part 3: Interaction Analysis

Create a binary smoking variable called smoker_current

data_2 <- data_1 %>%
  mutate(
        smoking    = factor(if_else(smoking == "Current daily smoker" | 
                             smoking == "Current some-day smoker", "Current smoker", "Non-smoker")))

Fit two models for smoking and smoking as an interaction

data_2$smoking_reref <- relevel(data_2$smoking, ref = "Non-smoker")
data_2$sex_reref <- relevel(data_2$sex, ref = "Male")

mlr_model_smoking_a <- lm(menthlth_days ~ physhlth_days + bmi + sex_reref + smoking_reref + 
                        exercise + age_group + income + education,
              data = data_2)

mlr_model_smoking_b <- lm(menthlth_days ~ physhlth_days + bmi + sex_reref*smoking_reref +
                          exercise + age_group + income + education,
              data = data_2)

tidy(mlr_model_smoking_b, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Smoking Interaction",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Smoking Interaction
Term Estimate SE t p-value CI Lower CI Upper
(Intercept) 6.8994 0.9286 7.4301 0.0000 5.0791 8.7196
physhlth_days 0.2686 0.0101 26.6788 0.0000 0.2489 0.2883
bmi 0.0331 0.0132 2.5017 0.0124 0.0072 0.0590
sex_rerefFemale 1.1855 0.1775 6.6784 0.0000 0.8376 1.5335
smoking_rerefCurrent smoker 1.5208 0.3654 4.1621 0.0000 0.8045 2.2371
exerciseYes -0.6728 0.2150 -3.1301 0.0018 -1.0942 -0.2515
age_group25-29 -0.9202 0.5196 -1.7709 0.0766 -1.9388 0.0984
age_group30-34 -0.8924 0.5059 -1.7640 0.0778 -1.8841 0.0993
age_group35-39 -1.5929 0.4875 -3.2673 0.0011 -2.5485 -0.6372
age_group40-44 -2.6286 0.4856 -5.4127 0.0000 -3.5806 -1.6766
age_group45-49 -2.8425 0.4969 -5.7209 0.0000 -3.8165 -1.8686
age_group50-54 -3.2778 0.4820 -6.8012 0.0000 -4.2226 -2.3331
age_group55-59 -4.2499 0.4740 -8.9652 0.0000 -5.1791 -3.3206
age_group60-64 -4.2640 0.4567 -9.3364 0.0000 -5.1593 -3.3688
age_group65-69 -5.2506 0.4466 -11.7563 0.0000 -6.1261 -4.3751
age_group70-74 -5.7111 0.4544 -12.5686 0.0000 -6.6018 -4.8203
age_group75-79 -5.9076 0.4702 -12.5646 0.0000 -6.8292 -4.9859
age_group80+ -6.4995 0.4736 -13.7239 0.0000 -7.4278 -5.5711
income$15,000 to < $25,000 -1.6357 0.4700 -3.4804 0.0005 -2.5570 -0.7144
income$25,000 to < $35,000 -2.0746 0.4486 -4.6243 0.0000 -2.9540 -1.1952
income$35,000 to < $50,000 -2.5455 0.4392 -5.7964 0.0000 -3.4064 -1.6847
income$50,000 to < $100,000 -3.0430 0.4116 -7.3935 0.0000 -3.8498 -2.2362
income$100,000 to < $200,000 -3.5097 0.4300 -8.1623 0.0000 -4.3526 -2.6668
income$200,000 or more -3.8405 0.5010 -7.6651 0.0000 -4.8226 -2.8583
educationHigh school graduate or GED 0.1256 0.8124 0.1546 0.8772 -1.4669 1.7180
educationSome college or technical school 1.1179 0.6912 1.6172 0.1059 -0.2371 2.4729
educationCollege graduate 1.8179 0.6928 2.6239 0.0087 0.4598 3.1760
educationGraduate or professional degree 1.6691 0.6943 2.4040 0.0162 0.3081 3.0300
sex_rerefFemale:smoking_rerefCurrent smoker 1.2833 0.5171 2.4819 0.0131 0.2697 2.2968

ANOVA test for two models

anova(mlr_model_smoking_a, mlr_model_smoking_b) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Full model", "Smoking interaction"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Interaction Test: Does smoking interact with sex?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Interaction Test: Does smoking interact with sex?
Model Res. df RSS df Sum of Sq F p-value
Full model 7972 432508.9 NA NA NA NA
Smoking interaction 7971 432174.9 1 333.9749 6.1598 0.0131

The null hypothesis is that there is no interaction between sex and smoking status. The alternative hypothesis is that there is an interaction between sex and smoking status. The F-statistic is 6.160 with 1 degree of freedom and a p-value of 0.0131. We can reject the null hypothesis and conclude that there is an interaction between sex and smoking status

Sex and Smoking Interaction

The coefficient for male smokers is 1.521. This means that being a smoker is associated with having 1.521 more mentally unhealthy days among males.

The coefficient for female smokers is 2.804. This means that being a smoker is associated with having 2.804 more mentally unhealthy days among females.

The model using sex interacting with smoking shows that smoking among females is more strongly associated with poor mental health, as compared to males. While male smokers experienced 1.521 more days of poor mental health compared to non-smokers, female smokers experienced 2.804 more days or 1.283 more days.

Visualized Sex and Smoking Interaction

pred_int <- ggpredict(mlr_model_smoking_b, terms = c("smoking_reref", "sex_reref"))

ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_point() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
  labs(
    title    = "Predicted Mental Health Days by Smoker Status and Sex",
    subtitle = "From interaction model: smoker status * sex",
    x        = "Smoking Status",
    y        = "Predicted Mentally Unhealthy Days",
    color    = "Sex",
    fill     = "Sex"
  ) +
  theme_minimal(base_size = 13) +
  scale_color_brewer(palette = "Set1")

The graph shows the associations between smoking and sex on mentally unhealthy days. The graph shows that the number of days with poor mental health increases when men or women smoke compared to those who are non-smokers. However, females who are smokers and non-smokers experience more mentally unhealthy compared to their male counterparts. The difference in poor mental health days is larger between men and women who smoke, which shows that smoking impacts the number of days with poor mental health for women more than men.

Part 4: Reflection

A. Findings and Limitations (6 points) What do the results suggest about the determinants of mental health among U.S. adults? The results of this assignment show that there are many factors involved in the mental health of U.S. adults. Age and income were heavily involved in the number of poor mental health days. For example, those aged 80 or more had 6.820 fewer days of poor mental health compared to those aged 18 to 24, adjusting for other variables. Also, those with an income of 200,000 or more had 3.784 fewer days of poor mental health compared to those with an income less than 15,000, adjusting for other variables. Many other variables such as smoking status, sex, bmi, and physical health were also associated with mental health outcomes. Education was not as clearly directly associated with mental health as income and age, with those gaining a high school degree or some college not fairing significantly better than those with less than a high school degree.

The data analyzed in this assignment is from a cross-sectional survey. This means that all data is presented from a snapshot of a sample population in 2023. The analysis cannot determine the direction of any associations claimed. Additional issues from the dataset could be from the responders being more likely to have poor mental health as a result of their voluntary inclusion in the survey. Additionally, confounders not controlled in these models include diet, alcohol or drug-use, and mental health diseases, which could cause spurious associations as these are likely involved in mental health among U.S. adults.

It’s important that this assignment reported adjusted R-squared in addition to simply R-squared as the adjusted value accounts for the additional complexity of the models. This means that additional covariates will be more heavily scrunitized in the adjusted R-squared values. This is important because adding as many covariates as possible will be penalized if they are not significantly relevant to the outcome. In addition, it’s important to test covariate importance in the models as chunks to compare how the model is impacted by the removal or addition of specific covariates. This is different from relying on individual t-tests as the model chunk tests will control for certain covariates, while testing to see if additional chunks of covariates improve this already defined model.