library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gtsummary)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ggeffects)
library(haven)
brfss_raw <- read_xpt("data/LLCP2023.XPT")
Total observations: 433,323 Total rows : 350
brfss_subset <- brfss_raw %>%
select(MENTHLTH, PHYSHLTH, `_BMI5`, SEXVAR, EXERANY2, `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`)
# Recode variables
brfss_subset <- brfss_subset %>%
mutate(
#menthlth_days
MENTHLTH = if_else(MENTHLTH %in% c(77, 99), NA_real_, MENTHLTH),
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
TRUE ~ NA_real_),
#physhlth_days
PHYSHLTH = if_else(PHYSHLTH %in% c(77, 99), NA_real_, PHYSHLTH),
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
TRUE ~ NA_real_),
#bmi
bmi = `_BMI5` / 100,
bmi = if_else(bmi %in% c(9999), NA_real_, bmi),
#sex
sex = factor(ifelse(SEXVAR == 1, "Male", "Female")),
#exercise
EXERANY2 = if_else(EXERANY2 %in% c(7, 9), NA_real_, EXERANY2),
exercise = factor(ifelse(EXERANY2 == 1, "Yes", "No")),
#age_group
`_AGEG5YR` = if_else(`_AGEG5YR` %in% c(14), NA_real_, `_AGEG5YR`),
age_group = factor(case_when(
`_AGEG5YR` == 1 ~ "18-24",
`_AGEG5YR` == 2 ~ "25-29",
`_AGEG5YR` == 3 ~ "30-34",
`_AGEG5YR` == 4 ~ "35-39",
`_AGEG5YR` == 5 ~ "40-44",
`_AGEG5YR` == 6 ~ "45-49",
`_AGEG5YR` == 7 ~ "50-54",
`_AGEG5YR` == 8 ~ "55-59",
`_AGEG5YR` == 9 ~ "60-64",
`_AGEG5YR` == 10 ~ "65-69",
`_AGEG5YR` == 11 ~ "70-74",
`_AGEG5YR` == 12 ~ "75-79",
`_AGEG5YR` == 13 ~ "80+"),
levels = c("18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
"50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+")),
#income
`_INCOMG1` = if_else(`_INCOMG1` %in% c(9), NA_real_, `_INCOMG1`),
income = factor(case_when(
`_INCOMG1` == 1 ~ "< $15,000",
`_INCOMG1` == 2 ~ "$15,000 to < $25,000",
`_INCOMG1` == 3 ~ "$25,000 to < $35,000",
`_INCOMG1` == 4 ~ "$35,000 to < $50,000",
`_INCOMG1` == 5 ~ "$50,000 to < $100,000",
`_INCOMG1` == 6 ~ "$100,000 to < $200,000",
`_INCOMG1` == 7 ~ "$200,000 or more"),
levels = c("< $15,000", "$15,000 to < $25,000", "$25,000 to < $35,000", "$35,000 to < $50,000",
"$50,000 to < $100,000", "$100,000 to < $200,000", "$200,000 or more")),
#education
EDUCA = if_else(EDUCA %in% c(9), NA_real_, EDUCA),
education = factor(case_when(
EDUCA == 1 | EDUCA == 2 ~ "< High school",
EDUCA == 3 ~ "High school graduate or GED",
EDUCA == 4 ~ "Some college or technical school",
EDUCA == 5 ~ "College graduate",
EDUCA == 6 ~ "Graduate or professional degree"),
levels = c("< High school", "High school graduate or GED",
"Some college or technical school", "College graduate", "Graduate or professional degree")),
#smoking
`_SMOKER3` = if_else(`_SMOKER3` %in% c(9), NA_real_, `_SMOKER3`),
smoking = factor(case_when(
`_SMOKER3` == 1 ~ "Current daily smoker",
`_SMOKER3` == 2 ~ "Current some-day smoker",
`_SMOKER3` == 3 ~ "Former smoker",
`_SMOKER3` == 4 ~ "Never smoker"),
levels = c("Current daily smoker", "Current some-day smoker",
"Former smoker", "Never smoker")))
Report the number and percentage of missing observations for menthlth_days, physhlth_days, and income
mean(is.na(brfss_subset$menthlth_days)) * 100 # 1.87%
## [1] 1.871122
mean(is.na(brfss_subset$physhlth_days)) * 100 # 2.49%
## [1] 2.488906
mean(is.na(brfss_subset$income)) * 100 # 19.99%
## [1] 19.9904
set.seed(1220)
brfss_analytic <- brfss_subset %>%
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) %>%
slice_sample(n = 8000)
write_rds(brfss_analytic, "data/brfss_subset_2023.rds")
data_1 <- readRDS("data/brfss_subset_2023.rds")
data_1 %>%
tbl_summary(
by = sex,
include = c(menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking),
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body mass index",
age_group ~ "Age group in 5-year categories",
income ~ "Annual household income",
exercise ~ "Any physical activity (past 30 days)",
education ~ "Highest level of education completed",
smoking ~ "Smoking status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) %>%
add_n() %>%
# add_overall() %>%
bold_labels() %>%
#italicize_levels() %>%
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000)**") %>%
as_flex_table()
Characteristic | N | Female | Male |
|---|---|---|---|
Mentally unhealthy days (past 30) | 8,000 | 5.1 (8.6) | 3.5 (7.5) |
Physically unhealthy days (past 30) | 8,000 | 4.9 (8.9) | 4.0 (8.4) |
Body mass index | 8,000 | 28.7 (7.0) | 28.7 (6.0) |
Any physical activity (past 30 days) | 8,000 | 3,094 (76%) | 3,146 (80%) |
Age group in 5-year categories | 8,000 | ||
18-24 | 171 (4.2%) | 235 (6.0%) | |
25-29 | 189 (4.7%) | 219 (5.6%) | |
30-34 | 210 (5.2%) | 253 (6.4%) | |
35-39 | 302 (7.4%) | 263 (6.7%) | |
40-44 | 292 (7.2%) | 290 (7.4%) | |
45-49 | 252 (6.2%) | 266 (6.8%) | |
50-54 | 303 (7.5%) | 305 (7.7%) | |
55-59 | 352 (8.7%) | 308 (7.8%) | |
60-64 | 379 (9.3%) | 408 (10%) | |
65-69 | 483 (12%) | 418 (11%) | |
70-74 | 426 (10%) | 382 (9.7%) | |
75-79 | 338 (8.3%) | 325 (8.3%) | |
80+ | 367 (9.0%) | 264 (6.7%) | |
Annual household income | 8,000 | ||
< $15,000 | 247 (6.1%) | 160 (4.1%) | |
$15,000 to < $25,000 | 370 (9.1%) | 271 (6.9%) | |
$25,000 to < $35,000 | 495 (12%) | 376 (9.6%) | |
$35,000 to < $50,000 | 585 (14%) | 482 (12%) | |
$50,000 to < $100,000 | 1,260 (31%) | 1,251 (32%) | |
$100,000 to < $200,000 | 869 (21%) | 996 (25%) | |
$200,000 or more | 238 (5.9%) | 400 (10%) | |
Highest level of education completed | 8,000 | ||
< High school | 49 (1.2%) | 75 (1.9%) | |
High school graduate or GED | 122 (3.0%) | 130 (3.3%) | |
Some college or technical school | 877 (22%) | 950 (24%) | |
College graduate | 1,120 (28%) | 1,018 (26%) | |
Graduate or professional degree | 1,896 (47%) | 1,763 (45%) | |
Smoking status | 8,000 | ||
Current daily smoker | 319 (7.8%) | 339 (8.6%) | |
Current some-day smoker | 117 (2.9%) | 151 (3.8%) | |
Former smoker | 1,055 (26%) | 1,207 (31%) | |
Never smoker | 2,573 (63%) | 2,239 (57%) | |
1Mean (SD); n (%) | |||
mlr_model <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking,
data = data_1)
tidy(mlr_model, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Maximum Model: All Candidate Predictors",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 11.0409 | 0.9593 | 11.5092 | 0.0000 | 9.1604 | 12.9214 |
| physhlth_days | 0.2656 | 0.0101 | 26.3841 | 0.0000 | 0.2459 | 0.2853 |
| bmi | 0.0334 | 0.0132 | 2.5274 | 0.0115 | 0.0075 | 0.0593 |
| sexMale | -1.3904 | 0.1675 | -8.3007 | 0.0000 | -1.7187 | -1.0620 |
| exerciseYes | -0.6512 | 0.2147 | -3.0326 | 0.0024 | -1.0721 | -0.2303 |
| age_group25-29 | -1.0566 | 0.5194 | -2.0343 | 0.0420 | -2.0747 | -0.0385 |
| age_group30-34 | -1.0939 | 0.5065 | -2.1600 | 0.0308 | -2.0867 | -0.1012 |
| age_group35-39 | -1.8110 | 0.4885 | -3.7072 | 0.0002 | -2.7686 | -0.8534 |
| age_group40-44 | -2.8931 | 0.4875 | -5.9346 | 0.0000 | -3.8487 | -1.9375 |
| age_group45-49 | -3.0562 | 0.4977 | -6.1408 | 0.0000 | -4.0318 | -2.0806 |
| age_group50-54 | -3.5167 | 0.4832 | -7.2775 | 0.0000 | -4.4640 | -2.5695 |
| age_group55-59 | -4.4960 | 0.4755 | -9.4543 | 0.0000 | -5.4282 | -3.5638 |
| age_group60-64 | -4.5221 | 0.4585 | -9.8633 | 0.0000 | -5.4209 | -3.6234 |
| age_group65-69 | -5.5779 | 0.4502 | -12.3903 | 0.0000 | -6.4604 | -4.6955 |
| age_group70-74 | -6.0254 | 0.4574 | -13.1728 | 0.0000 | -6.9220 | -5.1287 |
| age_group75-79 | -6.2866 | 0.4748 | -13.2392 | 0.0000 | -7.2174 | -5.3557 |
| age_group80+ | -6.8197 | 0.4768 | -14.3019 | 0.0000 | -7.7544 | -5.8850 |
| income$15,000 to < $25,000 | -1.6370 | 0.4690 | -3.4905 | 0.0005 | -2.5564 | -0.7177 |
| income$25,000 to < $35,000 | -2.0764 | 0.4480 | -4.6351 | 0.0000 | -2.9545 | -1.1982 |
| income$35,000 to < $50,000 | -2.5469 | 0.4382 | -5.8122 | 0.0000 | -3.4058 | -1.6879 |
| income$50,000 to < $100,000 | -3.0505 | 0.4107 | -7.4277 | 0.0000 | -3.8555 | -2.2454 |
| income$100,000 to < $200,000 | -3.4998 | 0.4292 | -8.1537 | 0.0000 | -4.3413 | -2.6584 |
| income$200,000 or more | -3.7841 | 0.5004 | -7.5628 | 0.0000 | -4.7649 | -2.8033 |
| educationHigh school graduate or GED | 0.0799 | 0.8107 | 0.0986 | 0.9215 | -1.5092 | 1.6690 |
| educationSome college or technical school | 1.0768 | 0.6897 | 1.5612 | 0.1185 | -0.2753 | 2.4288 |
| educationCollege graduate | 1.7909 | 0.6912 | 2.5911 | 0.0096 | 0.4360 | 3.1458 |
| educationGraduate or professional degree | 1.7378 | 0.6925 | 2.5095 | 0.0121 | 0.3803 | 3.0953 |
| smokingCurrent some-day smoker | -1.5867 | 0.5352 | -2.9645 | 0.0030 | -2.6359 | -0.5375 |
| smokingFormer smoker | -1.9897 | 0.3371 | -5.9020 | 0.0000 | -2.6506 | -1.3289 |
| smokingNever smoker | -2.9368 | 0.3216 | -9.1313 | 0.0000 | -3.5673 | -2.3063 |
\[\text{mental health days} = 11.041 + 0.266 \cdot \text{physical health days} + 0.033 \cdot \text{bmi} + -1.390 \cdot \text{sexMale} + -0.651 \cdot \text{exerciseYes} + -1.057 \cdot \text{age_group25-29} + -1.094 \cdot \text{age_group30-34} + -1.811 \cdot \text{age_group35-39} + -2.893 \cdot \text{age_group40-44} + -3.056 \cdot \text{age_group45-49} + -3.517 \cdot \text{age_group50-54} + -4.496 \cdot \text{age_group55-59} + -4.522 \cdot \text{age_group60-64} + -5.578 \cdot \text{age_group65-69} + -6.025 \cdot \text{age_group70-74} + -6.287 \cdot \text{age_group75-79} + -6.820 \cdot \text{age_group80+} + -1.637 \cdot \text{income15,000-<25,000} + -2.076 \cdot \text{income25,000-<35,000} + -2.547 \cdot \text{income35,000-<50,000} + -3.051 \cdot \text{income50,000-<100,000} + -3.500 \cdot \text{income100,000-<200,000} + -3.784 \cdot \text{income$200,000 or more} + 0.080 \cdot \text{educationHigh school graduate or GED} + 1.077 \cdot \text{educationSome college or technical school} + 1.791 \cdot \text{educationCollege graduate} + 1.738 \cdot \text{educationGraduate or professional degree} + -1.587 \cdot \text{smokingCurrent some-day smoker} + -1.990 \cdot \text{smokingFormer smoker} + -2.937 \cdot \text{smokingNever smoker} + \varepsilon\] #### Interpretation of regression equation • physhlth_days (continuous predictor) - Every additional day with poor physical health was associated with 0.266 more days of poor mental health, adjusting for other variables.
• bmi (continuous predictor) - Every additional one unit increase in BMI was associated with 0.033 more days of poor mental health, adjusting for other variables.
• sex: Female vs. Male (reference) - Males had 1.390 fewer days of poor mental health compared to females, adjusting for other variables.
• exercise: Yes vs. No (reference) - Those exercise had 0.651 fewer days of poor mental health compared to those who did not exercise, adjusting for other variables.
• One age_group coefficient of your choice - Those aged 80 or more had 6.820 fewer days of poor mental health compared to those aged 18 to 24, adjusting for other variables.
• Two income coefficients of your choice, compared to the reference category (Less than $15,000) - Those with an income of 50,000 to 100,000 had 3.051 fewer days of poor mental health compared to those with an income less than 15,000, adjusting for other variables. - Those with an income of 200,000 or more had 3.784 fewer days of poor mental health compared to those with an income less than 15,000, adjusting for other variables.
summary(mlr_model)
##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = data_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.04090 0.95931 11.509 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexMale -1.39038 0.16750 -8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000 to < $25,000 -1.63703 0.46899 -3.491 0.000485
## income$25,000 to < $35,000 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000 to < $50,000 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000 to < $100,000 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000 to < $200,000 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school graduate or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexMale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000 to < $25,000 ***
## income$25,000 to < $35,000 ***
## income$35,000 to < $50,000 ***
## income$50,000 to < $100,000 ***
## income$100,000 to < $200,000 ***
## income$200,000 or more ***
## educationHigh school graduate or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
• R-squared: proportion of variance in mentally unhealthy days explained by all predictors The R-squared is 0.185. This value describes how well the multiple linear regression model fits.
• Adjusted R-squared: how it differs from R-squared and what it adds The adjusted R-squared is 0.182. This value describes how well the multiple linear regression model fits, adjusting for the number of covariates.
• Root MSE (residual standard error): average prediction error of the model The residual standard error is 7.352. This value describes how close the observed data points are to the regression model.
• Overall F-test: state H0, F-statistic, degrees of freedom (numerator and denominator), p-value, and conclusion The null hypothesis is that the number of days with poor physical health is not associated with the number of days with poor mental health, adjusting for all other variables. The F-statistic is 62.52 with 29 degrees of freedom and a p-value <2.2e-16. We can reject the null hypothesis and claim that the number of days with poor physical health is associated with the number of days with poor mental health, adjusting for all other variables.
Anova(mlr_model, type = "III") %>%
as.data.frame() %>%
rownames_to_column("Source") %>%
filter(Source != "(Intercept)") %>%
mutate(
Significant = ifelse(`Pr(>F)` < 0.05, "Yes", "No"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 8. Type III Partial F-Tests for Each Predictor",
col.names = c("Source", "Sum of Sq", "df", "F value", "p-value", "Significant (α = 0.05)")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
column_spec(6, bold = TRUE)
| Source | Sum of Sq | df | F value | p-value | Significant (α = 0.05) |
|---|---|---|---|---|---|
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 | Yes |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 | Yes |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 | Yes |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 | Yes |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 | Yes |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 | Yes |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 | Yes |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 | Yes |
| Residuals | 430750.0872 | 7970 | NA | NA | NA |
All the predictors have statistically significant partial associations at α = 0.05.
mlr_model_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking,
data = data_1)
anova(mlr_model_no_income, mlr_model) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced (without income)", "Full"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Chunk Test: Does income add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (without income) | 7976 | 435484.0 | NA | NA | NA | NA |
| Full | 7970 | 430750.1 | 6 | 4733.894 | 14.5982 | 0 |
The null hypothesis is that income does not significantly add to the model, whereas the alternative hypothesis is that income does significantly add to the model. The F-statistic is 14.598 with 6 degrees of freedom and a p-value <0.05. We can reject the null hypothesis and claim that income does significantly add to the model.
mlr_model_no_education <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking,
data = data_1)
anova(mlr_model_no_education, mlr_model) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Reduced (without education)", "Full"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Chunk Test: Does education add to the model?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Reduced (without education) | 7974 | 432015.2 | NA | NA | NA | NA |
| Full | 7970 | 430750.1 | 4 | 1265.15 | 5.8521 | 1e-04 |
The null hypothesis is that education does not significantly add to the model, whereas the alternative hypothesis is that education does significantly add to the model. The F-statistic is 5.852 with 4 degrees of freedom and a p-value <0.05. We can reject the null hypothesis and claim that education does significantly add to the model.
#Chunk test synthesis The type III results showed that all variables in the model have significant impact on mental health days. The chunk tests aimed to determine if the model would be stronger with or without income and education, which showed that the full model was significantly more predictive for both variables. Based on the chunk tests, income has a larger independent contribution compared to education based on the larger F-statistic and both have significant p-values. Physical health days had the highest F-statistic meaning this variable has the strongest independent contribution to the model. The chunk tests show the individual impact of the variables of interest to see if the model would be impacted by their removal.
data_2 <- data_1 %>%
mutate(
smoking = factor(if_else(smoking == "Current daily smoker" |
smoking == "Current some-day smoker", "Current smoker", "Non-smoker")))
data_2$smoking_reref <- relevel(data_2$smoking, ref = "Non-smoker")
data_2$sex_reref <- relevel(data_2$sex, ref = "Male")
mlr_model_smoking_a <- lm(menthlth_days ~ physhlth_days + bmi + sex_reref + smoking_reref +
exercise + age_group + income + education,
data = data_2)
mlr_model_smoking_b <- lm(menthlth_days ~ physhlth_days + bmi + sex_reref*smoking_reref +
exercise + age_group + income + education,
data = data_2)
tidy(mlr_model_smoking_b, conf.int = TRUE) |>
mutate(across(where(is.numeric), \(x) round(x, 4))) |>
kable(
caption = "Smoking Interaction",
col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
) |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | Estimate | SE | t | p-value | CI Lower | CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.8994 | 0.9286 | 7.4301 | 0.0000 | 5.0791 | 8.7196 |
| physhlth_days | 0.2686 | 0.0101 | 26.6788 | 0.0000 | 0.2489 | 0.2883 |
| bmi | 0.0331 | 0.0132 | 2.5017 | 0.0124 | 0.0072 | 0.0590 |
| sex_rerefFemale | 1.1855 | 0.1775 | 6.6784 | 0.0000 | 0.8376 | 1.5335 |
| smoking_rerefCurrent smoker | 1.5208 | 0.3654 | 4.1621 | 0.0000 | 0.8045 | 2.2371 |
| exerciseYes | -0.6728 | 0.2150 | -3.1301 | 0.0018 | -1.0942 | -0.2515 |
| age_group25-29 | -0.9202 | 0.5196 | -1.7709 | 0.0766 | -1.9388 | 0.0984 |
| age_group30-34 | -0.8924 | 0.5059 | -1.7640 | 0.0778 | -1.8841 | 0.0993 |
| age_group35-39 | -1.5929 | 0.4875 | -3.2673 | 0.0011 | -2.5485 | -0.6372 |
| age_group40-44 | -2.6286 | 0.4856 | -5.4127 | 0.0000 | -3.5806 | -1.6766 |
| age_group45-49 | -2.8425 | 0.4969 | -5.7209 | 0.0000 | -3.8165 | -1.8686 |
| age_group50-54 | -3.2778 | 0.4820 | -6.8012 | 0.0000 | -4.2226 | -2.3331 |
| age_group55-59 | -4.2499 | 0.4740 | -8.9652 | 0.0000 | -5.1791 | -3.3206 |
| age_group60-64 | -4.2640 | 0.4567 | -9.3364 | 0.0000 | -5.1593 | -3.3688 |
| age_group65-69 | -5.2506 | 0.4466 | -11.7563 | 0.0000 | -6.1261 | -4.3751 |
| age_group70-74 | -5.7111 | 0.4544 | -12.5686 | 0.0000 | -6.6018 | -4.8203 |
| age_group75-79 | -5.9076 | 0.4702 | -12.5646 | 0.0000 | -6.8292 | -4.9859 |
| age_group80+ | -6.4995 | 0.4736 | -13.7239 | 0.0000 | -7.4278 | -5.5711 |
| income$15,000 to < $25,000 | -1.6357 | 0.4700 | -3.4804 | 0.0005 | -2.5570 | -0.7144 |
| income$25,000 to < $35,000 | -2.0746 | 0.4486 | -4.6243 | 0.0000 | -2.9540 | -1.1952 |
| income$35,000 to < $50,000 | -2.5455 | 0.4392 | -5.7964 | 0.0000 | -3.4064 | -1.6847 |
| income$50,000 to < $100,000 | -3.0430 | 0.4116 | -7.3935 | 0.0000 | -3.8498 | -2.2362 |
| income$100,000 to < $200,000 | -3.5097 | 0.4300 | -8.1623 | 0.0000 | -4.3526 | -2.6668 |
| income$200,000 or more | -3.8405 | 0.5010 | -7.6651 | 0.0000 | -4.8226 | -2.8583 |
| educationHigh school graduate or GED | 0.1256 | 0.8124 | 0.1546 | 0.8772 | -1.4669 | 1.7180 |
| educationSome college or technical school | 1.1179 | 0.6912 | 1.6172 | 0.1059 | -0.2371 | 2.4729 |
| educationCollege graduate | 1.8179 | 0.6928 | 2.6239 | 0.0087 | 0.4598 | 3.1760 |
| educationGraduate or professional degree | 1.6691 | 0.6943 | 2.4040 | 0.0162 | 0.3081 | 3.0300 |
| sex_rerefFemale:smoking_rerefCurrent smoker | 1.2833 | 0.5171 | 2.4819 | 0.0131 | 0.2697 | 2.2968 |
anova(mlr_model_smoking_a, mlr_model_smoking_b) %>%
as.data.frame() %>%
rownames_to_column("Model") %>%
mutate(
Model = c("Full model", "Smoking interaction"),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Interaction Test: Does smoking interact with sex?",
col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| Full model | 7972 | 432508.9 | NA | NA | NA | NA |
| Smoking interaction | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
The null hypothesis is that there is no interaction between sex and smoking status. The alternative hypothesis is that there is an interaction between sex and smoking status. The F-statistic is 6.160 with 1 degree of freedom and a p-value of 0.0131. We can reject the null hypothesis and conclude that there is an interaction between sex and smoking status
The coefficient for male smokers is 1.521. This means that being a smoker is associated with having 1.521 more mentally unhealthy days among males.
The coefficient for female smokers is 2.804. This means that being a smoker is associated with having 2.804 more mentally unhealthy days among females.
The model using sex interacting with smoking shows that smoking among females is more strongly associated with poor mental health, as compared to males. While male smokers experienced 1.521 more days of poor mental health compared to non-smokers, female smokers experienced 2.804 more days or 1.283 more days.
pred_int <- ggpredict(mlr_model_smoking_b, terms = c("smoking_reref", "sex_reref"))
ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group)) +
geom_point() +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
labs(
title = "Predicted Mental Health Days by Smoker Status and Sex",
subtitle = "From interaction model: smoker status * sex",
x = "Smoking Status",
y = "Predicted Mentally Unhealthy Days",
color = "Sex",
fill = "Sex"
) +
theme_minimal(base_size = 13) +
scale_color_brewer(palette = "Set1")
The graph shows the associations between smoking and sex on mentally
unhealthy days. The graph shows that the number of days with poor mental
health increases when men or women smoke compared to those who are
non-smokers. However, females who are smokers and non-smokers experience
more mentally unhealthy compared to their male counterparts. The
difference in poor mental health days is larger between men and women
who smoke, which shows that smoking impacts the number of days with poor
mental health for women more than men.
A. Findings and Limitations (6 points) What do the results suggest about the determinants of mental health among U.S. adults? The results of this assignment show that there are many factors involved in the mental health of U.S. adults. Age and income were heavily involved in the number of poor mental health days. For example, those aged 80 or more had 6.820 fewer days of poor mental health compared to those aged 18 to 24, adjusting for other variables. Also, those with an income of 200,000 or more had 3.784 fewer days of poor mental health compared to those with an income less than 15,000, adjusting for other variables. Many other variables such as smoking status, sex, bmi, and physical health were also associated with mental health outcomes. Education was not as clearly directly associated with mental health as income and age, with those gaining a high school degree or some college not fairing significantly better than those with less than a high school degree.
The data analyzed in this assignment is from a cross-sectional survey. This means that all data is presented from a snapshot of a sample population in 2023. The analysis cannot determine the direction of any associations claimed. Additional issues from the dataset could be from the responders being more likely to have poor mental health as a result of their voluntary inclusion in the survey. Additionally, confounders not controlled in these models include diet, alcohol or drug-use, and mental health diseases, which could cause spurious associations as these are likely involved in mental health among U.S. adults.
It’s important that this assignment reported adjusted R-squared in addition to simply R-squared as the adjusted value accounts for the additional complexity of the models. This means that additional covariates will be more heavily scrunitized in the adjusted R-squared values. This is important because adding as many covariates as possible will be penalized if they are not significantly relevant to the outcome. In addition, it’s important to test covariate importance in the models as chunks to compare how the model is impacted by the removal or addition of specific covariates. This is different from relying on individual t-tests as the model chunk tests will control for certain covariates, while testing to see if additional chunks of covariates improve this already defined model.