library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)brfss_raw <- read_xpt("C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 3/LLCP2023.XPT")
cat("Raw dataset dimensions:\n")
cat(" Rows: ", nrow(brfss_raw), "\n")
cat(" Columns:", ncol(brfss_raw), "\n")The 2023 BRFSS raw dataset includes 433,323 observations and 350 variables, with each row corresponding to a single respondent from U.S. states and territories surveyed in 2023.
library(haven)
library(dplyr)
library(janitor)
library(stringr)
brfss_raw_clean <- brfss_raw |>
janitor::clean_names()
# Diagnostic — verify exact names before selecting
cat("menthlth :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^menthlth$")], "\n")
cat("physhlth :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^physhlth$")], "\n")
cat("bmi5 :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^bmi5$")], "\n")
cat("sexvar :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^sexvar$")], "\n")
cat("exerany2 :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^exerany2$")], "\n")
cat("ageg5yr :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^ageg5yr$")], "\n")
cat("incomg1 :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^incomg1$")], "\n")
cat("educa :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^educa$")], "\n")
cat("smoker3 :", names(brfss_raw_clean)[stringr::str_detect(names(brfss_raw_clean), "^smoker3$")], "\n")
# Select required variables
brfss_selected <- brfss_raw_clean |>
dplyr::select(
menthlth, # was MENTHLTH
physhlth, # was PHYSHLTH
bmi5, # was _BMI5 → confirmed: bmi5
sexvar, # was SEXVAR
exerany2, # was EXERANY2
ageg5yr, # was _AGEG5YR → confirmed: ageg5yr
incomg1, # was _INCOMG1 → confirmed: incomg1
educa, # was EDUCA
smoker3 # was _SMOKER3 → confirmed: smoker3
)
# Recode all nine variables
brfss_clean <- brfss_selected |>
mutate(
# Mentally unhealthy days: 88=None->0; 77/99=DK/Refused->NA; 1-30 kept
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
menthlth %in% c(77, 99) ~ NA_real_,
TRUE ~ NA_real_
),
# Physically unhealthy days: same recoding rules
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
physhlth %in% c(77, 99) ~ NA_real_,
TRUE ~ NA_real_
),
# BMI: stored as BMI × 100; divide by 100 for actual value; 9999->NA
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ as.numeric(bmi5) / 100
),
# Sex: 1=Male (reference), 2=Female
sex = factor(
case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
),
# Exercise: 1=Yes, 2=No; 7/9->NA; No is reference
exercise = factor(
case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes")
),
# Age group: 13 five-year bands; 14->NA; 18-24 is reference
age_group = factor(
case_when(
ageg5yr == 1 ~ "18-24",
ageg5yr == 2 ~ "25-29",
ageg5yr == 3 ~ "30-34",
ageg5yr == 4 ~ "35-39",
ageg5yr == 5 ~ "40-44",
ageg5yr == 6 ~ "45-49",
ageg5yr == 7 ~ "50-54",
ageg5yr == 8 ~ "55-59",
ageg5yr == 9 ~ "60-64",
ageg5yr == 10 ~ "65-69",
ageg5yr == 11 ~ "70-74",
ageg5yr == 12 ~ "75-79",
ageg5yr == 13 ~ "80+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-24","25-29","30-34","35-39","40-44","45-49",
"50-54","55-59","60-64","65-69","70-74","75-79","80+")
),
# Income: 7 levels; 9->NA; "Less than $15,000" is reference
income = factor(
case_when(
incomg1 == 1 ~ "Less than $15,000",
incomg1 == 2 ~ "$15,000-$24,999",
incomg1 == 3 ~ "$25,000-$34,999",
incomg1 == 4 ~ "$35,000-$49,999",
incomg1 == 5 ~ "$50,000-$99,999",
incomg1 == 6 ~ "$100,000-$199,999",
incomg1 == 7 ~ "$200,000 or more",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000","$15,000-$24,999","$25,000-$34,999",
"$35,000-$49,999","$50,000-$99,999",
"$100,000-$199,999","$200,000 or more")
),
# Education: 5 levels; 1-2 collapsed; 9->NA
education = factor(
case_when(
educa %in% c(1, 2) ~ "Less than high school",
educa == 3 ~ "High school diploma or GED",
educa == 4 ~ "Some college or technical school",
educa == 5 ~ "College graduate",
educa == 6 ~ "Graduate or professional degree",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school",
"High school diploma or GED",
"Some college or technical school",
"College graduate",
"Graduate or professional degree")
),
# Smoking status: 4 levels; 9->NA
smoking = factor(
case_when(
smoker3 == 1 ~ "Current daily smoker",
smoker3 == 2 ~ "Current some-day smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Current daily smoker","Current some-day smoker",
"Former smoker","Never smoker")
)
) |>
dplyr::select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking)
# Save file
saveRDS(brfss_clean, "C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 3/brfss_clean.rds")
cat("Saved: brfss_clean.rds —", nrow(brfss_clean), "rows,",
ncol(brfss_clean), "columns\n")# Load from the saved RDS
brfss_clean <- readRDS("C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 3/brfss_clean.rds")
cat("Clean dataset dimensions:\n")## Clean dataset dimensions:
## Rows: 433323
## Columns: 9
## Variables: menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking
miss_report <- brfss_clean |>
summarise(
across(
everything(),
list(
N_missing = ~ sum(is.na(.)),
Pct_missing = ~ round(mean(is.na(.)) * 100, 1)
),
.names = "{.col}__{.fn}"
)
) |>
pivot_longer(
everything(),
names_to = c("Variable", ".value"),
names_sep = "__"
) |>
rename(`N Missing` = N_missing, `% Missing` = Pct_missing)
miss_report |>
kable(
caption = "Table 1.0. Missing Data for All Nine Analysis Variables (Before Exclusions)",
col.names = c("Variable", "N Missing", "% Missing")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Variable | N Missing | % Missing |
|---|---|---|
| menthlth_days | 8108 | 1.9 |
| physhlth_days | 10785 | 2.5 |
| bmi | 40535 | 9.4 |
| sex | 0 | 0.0 |
| exercise | 1251 | 0.3 |
| age_group | 7779 | 1.8 |
| income | 86623 | 20.0 |
| education | 2325 | 0.5 |
| smoking | 23062 | 5.3 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
cat("Final analytic sample size: n =", nrow(brfss_analytic), "\n")## Final analytic sample size: n = 8000
Final analytic sample: After excluding observations with missing data
across the nine variables of interest and generating a reproducible
random sample using set.seed(1220), the resulting dataset
includes 8,000 observations.
#descriptive-table
brfss_analytic |>
dplyr::select(menthlth_days, physhlth_days, bmi, exercise,
age_group, income, education, smoking, sex) |>
tbl_summary(
by = sex,
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body Mass Index (kg/m²)",
exercise ~ "Any exercise in past 30 days",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education level",
smoking ~ "Smoking status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_overall() |>
add_p() |>
bold_labels() |>
italicize_levels() |>
modify_caption(
"**Table 1.1. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000), Stratified by Sex**"
) |>
as_flex_table()Characteristic | Overall | Male | Female | p-value |
|---|---|---|---|---|
Mentally unhealthy days (past 30) | 4.3 (8.1) | 3.5 (7.5) | 5.1 (8.6) | |
Physically unhealthy days (past 30) | 4.4 (8.7) | 4.0 (8.4) | 4.9 (8.9) | |
Body Mass Index (kg/m²) | 28.7 (6.5) | 28.7 (6.0) | 28.7 (7.0) | |
Any exercise in past 30 days | 6,240 (78%) | 3,146 (80%) | 3,094 (76%) | |
Age group | ||||
18-24 | 406 (5.1%) | 235 (6.0%) | 171 (4.2%) | |
25-29 | 408 (5.1%) | 219 (5.6%) | 189 (4.7%) | |
30-34 | 463 (5.8%) | 253 (6.4%) | 210 (5.2%) | |
35-39 | 565 (7.1%) | 263 (6.7%) | 302 (7.4%) | |
40-44 | 582 (7.3%) | 290 (7.4%) | 292 (7.2%) | |
45-49 | 518 (6.5%) | 266 (6.8%) | 252 (6.2%) | |
50-54 | 608 (7.6%) | 305 (7.7%) | 303 (7.5%) | |
55-59 | 660 (8.3%) | 308 (7.8%) | 352 (8.7%) | |
60-64 | 787 (9.8%) | 408 (10%) | 379 (9.3%) | |
65-69 | 901 (11%) | 418 (11%) | 483 (12%) | |
70-74 | 808 (10%) | 382 (9.7%) | 426 (10%) | |
75-79 | 663 (8.3%) | 325 (8.3%) | 338 (8.3%) | |
80+ | 631 (7.9%) | 264 (6.7%) | 367 (9.0%) | |
Annual household income | ||||
Less than $15,000 | 407 (5.1%) | 160 (4.1%) | 247 (6.1%) | |
$15,000-$24,999 | 641 (8.0%) | 271 (6.9%) | 370 (9.1%) | |
$25,000-$34,999 | 871 (11%) | 376 (9.6%) | 495 (12%) | |
$35,000-$49,999 | 1,067 (13%) | 482 (12%) | 585 (14%) | |
$50,000-$99,999 | 2,511 (31%) | 1,251 (32%) | 1,260 (31%) | |
$100,000-$199,999 | 1,865 (23%) | 996 (25%) | 869 (21%) | |
$200,000 or more | 638 (8.0%) | 400 (10%) | 238 (5.9%) | |
Education level | ||||
Less than high school | 124 (1.6%) | 75 (1.9%) | 49 (1.2%) | |
High school diploma or GED | 252 (3.2%) | 130 (3.3%) | 122 (3.0%) | |
Some college or technical school | 1,827 (23%) | 950 (24%) | 877 (22%) | |
College graduate | 2,138 (27%) | 1,018 (26%) | 1,120 (28%) | |
Graduate or professional degree | 3,659 (46%) | 1,763 (45%) | 1,896 (47%) | |
Smoking status | ||||
Current daily smoker | 658 (8.2%) | 339 (8.6%) | 319 (7.8%) | |
Current some-day smoker | 268 (3.4%) | 151 (3.8%) | 117 (2.9%) | |
Former smoker | 2,262 (28%) | 1,207 (31%) | 1,055 (26%) | |
Never smoker | 4,812 (60%) | 2,239 (57%) | 2,573 (63%) | |
1Mean (SD); n (%) | ||||
mod_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
summary(mod_full)##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000-$24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000-$34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000-$49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000-$99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000-$199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000-$24,999 ***
## income$25,000-$34,999 ***
## income$35,000-$49,999 ***
## income$50,000-$99,999 ***
## income$100,000-$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
#coef-table
tidy(mod_full, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 1.2. Full Model Coefficient Estimates with 95% Confidence Intervals",
col.names = c("Term","Estimate","SE","t-statistic","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
scroll_box(height = "420px")| Term | Estimate | SE | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 9.6505 | 0.9541 | 10.1151 | 0.0000 | 7.7803 | 11.5208 |
| physhlth_days | 0.2656 | 0.0101 | 26.3841 | 0.0000 | 0.2459 | 0.2853 |
| bmi | 0.0334 | 0.0132 | 2.5274 | 0.0115 | 0.0075 | 0.0593 |
| sexFemale | 1.3904 | 0.1675 | 8.3007 | 0.0000 | 1.0620 | 1.7187 |
| exerciseYes | -0.6512 | 0.2147 | -3.0326 | 0.0024 | -1.0721 | -0.2303 |
| age_group25-29 | -1.0566 | 0.5194 | -2.0343 | 0.0420 | -2.0747 | -0.0385 |
| age_group30-34 | -1.0939 | 0.5065 | -2.1600 | 0.0308 | -2.0867 | -0.1012 |
| age_group35-39 | -1.8110 | 0.4885 | -3.7072 | 0.0002 | -2.7686 | -0.8534 |
| age_group40-44 | -2.8931 | 0.4875 | -5.9346 | 0.0000 | -3.8487 | -1.9375 |
| age_group45-49 | -3.0562 | 0.4977 | -6.1408 | 0.0000 | -4.0318 | -2.0806 |
| age_group50-54 | -3.5167 | 0.4832 | -7.2775 | 0.0000 | -4.4640 | -2.5695 |
| age_group55-59 | -4.4960 | 0.4755 | -9.4543 | 0.0000 | -5.4282 | -3.5638 |
| age_group60-64 | -4.5221 | 0.4585 | -9.8633 | 0.0000 | -5.4209 | -3.6234 |
| age_group65-69 | -5.5779 | 0.4502 | -12.3903 | 0.0000 | -6.4604 | -4.6955 |
| age_group70-74 | -6.0254 | 0.4574 | -13.1728 | 0.0000 | -6.9220 | -5.1287 |
| age_group75-79 | -6.2866 | 0.4748 | -13.2392 | 0.0000 | -7.2174 | -5.3557 |
| age_group80+ | -6.8197 | 0.4768 | -14.3019 | 0.0000 | -7.7544 | -5.8850 |
| income$15,000-$24,999 | -1.6370 | 0.4690 | -3.4905 | 0.0005 | -2.5564 | -0.7177 |
| income$25,000-$34,999 | -2.0764 | 0.4480 | -4.6351 | 0.0000 | -2.9545 | -1.1982 |
| income$35,000-$49,999 | -2.5469 | 0.4382 | -5.8122 | 0.0000 | -3.4058 | -1.6879 |
| income$50,000-$99,999 | -3.0505 | 0.4107 | -7.4277 | 0.0000 | -3.8555 | -2.2454 |
| income$100,000-$199,999 | -3.4998 | 0.4292 | -8.1537 | 0.0000 | -4.3413 | -2.6584 |
| income$200,000 or more | -3.7841 | 0.5004 | -7.5628 | 0.0000 | -4.7649 | -2.8033 |
| educationHigh school diploma or GED | 0.0799 | 0.8107 | 0.0986 | 0.9215 | -1.5092 | 1.6690 |
| educationSome college or technical school | 1.0768 | 0.6897 | 1.5612 | 0.1185 | -0.2753 | 2.4288 |
| educationCollege graduate | 1.7909 | 0.6912 | 2.5911 | 0.0096 | 0.4360 | 3.1458 |
| educationGraduate or professional degree | 1.7378 | 0.6925 | 2.5095 | 0.0121 | 0.3803 | 3.0953 |
| smokingCurrent some-day smoker | -1.5867 | 0.5352 | -2.9645 | 0.0030 | -2.6359 | -0.5375 |
| smokingFormer smoker | -1.9897 | 0.3371 | -5.9020 | 0.0000 | -2.6506 | -1.3289 |
| smokingNever smoker | -2.9368 | 0.3216 | -9.1313 | 0.0000 | -3.5673 | -2.3063 |
## (Intercept)
## 9.651
## physhlth_days
## 0.266
## bmi
## 0.033
## sexFemale
## 1.390
## exerciseYes
## -0.651
## age_group25-29
## -1.057
## age_group30-34
## -1.094
## age_group35-39
## -1.811
## age_group40-44
## -2.893
## age_group45-49
## -3.056
## age_group50-54
## -3.517
## age_group55-59
## -4.496
## age_group60-64
## -4.522
## age_group65-69
## -5.578
## age_group70-74
## -6.025
## age_group75-79
## -6.287
## age_group80+
## -6.820
## income$15,000-$24,999
## -1.637
## income$25,000-$34,999
## -2.076
## income$35,000-$49,999
## -2.547
## income$50,000-$99,999
## -3.050
## income$100,000-$199,999
## -3.500
## income$200,000 or more
## -3.784
## educationHigh school diploma or GED
## 0.080
## educationSome college or technical school
## 1.077
## educationCollege graduate
## 1.791
## educationGraduate or professional degree
## 1.738
## smokingCurrent some-day smoker
## -1.587
## smokingFormer smoker
## -1.990
## smokingNever smoker
## -2.937
b0 <- coefs["(Intercept)"]
b_phys <- coefs["physhlth_days"]
b_bmi <- coefs["bmi"]
b_female <- coefs["sexFemale"]
b_exyes <- coefs["exerciseYes"]
b_age2529 <- coefs["age_group25-29"]
b_age3034 <- coefs["age_group30-34"]
b_age3539 <- coefs["age_group35-39"]
b_age4044 <- coefs["age_group40-44"]
b_age4549 <- coefs["age_group45-49"]
b_age5054 <- coefs["age_group50-54"]
b_age5559 <- coefs["age_group55-59"]
b_age6064 <- coefs["age_group60-64"]
b_age6569 <- coefs["age_group65-69"]
b_age7074 <- coefs["age_group70-74"]
b_age7579 <- coefs["age_group75-79"]
b_age80p <- coefs["age_group80+"]
b_inc15_24 <- coefs["income$15,000-$24,999"]
b_inc25_34 <- coefs["income$25,000-$34,999"]
b_inc35_49 <- coefs["income$35,000-$49,999"]
b_inc50_99 <- coefs["income$50,000-$99,999"]
b_inc100_199 <- coefs["income$100,000-$199,999"]
b_inc200p <- coefs["income$200,000 or more"]
b_ed_hs <- coefs["educationHigh school diploma or GED"]
b_ed_some <- coefs["educationSome college or technical school"]
b_ed_col <- coefs["educationCollege graduate"]
b_ed_grad <- coefs["educationGraduate or professional degree"]
b_sm_some <- coefs["smokingCurrent some-day smoker"]
b_sm_former <- coefs["smokingFormer smoker"]
b_sm_never <- coefs["smokingNever smoker"]The fitted regression equation (coefficients rounded to three decimal places) is:
\[ \begin{aligned} \widehat{\text{Mentally Unhealthy Days}} =\ & 9.651 \\ &+ 0.266 \cdot \text{PhysicalHealthDays} \\ &+ 0.033 \cdot \text{BMI} \\ &+ 1.39 \cdot \mathbf{1}(\text{Female}) \\ &+ -0.651 \cdot \mathbf{1}(\text{Exercise = Yes}) \\ &+ -1.057 \cdot \mathbf{1}(\text{25--29}) \\ &+ -1.094 \cdot \mathbf{1}(\text{30--34}) \\ &+ -1.811 \cdot \mathbf{1}(\text{35--39}) \\ &+ -2.893 \cdot \mathbf{1}(\text{40--44}) \\ &+ -3.056 \cdot \mathbf{1}(\text{45--49}) \\ &+ -3.517 \cdot \mathbf{1}(\text{50--54}) \\ &+ -4.496 \cdot \mathbf{1}(\text{55--59}) \\ &+ -4.522 \cdot \mathbf{1}(\text{60--64}) \\ &+ -5.578 \cdot \mathbf{1}(\text{65--69}) \\ &+ -6.025 \cdot \mathbf{1}(\text{70--74}) \\ &+ -6.287 \cdot \mathbf{1}(\text{75--79}) \\ &+ -6.82 \cdot \mathbf{1}(\text{80+}) \\ &+ -1.637 \cdot \mathbf{1}(\text{\$15,000--\$24,999}) \\ &+ -2.076 \cdot \mathbf{1}(\text{\$25,000--\$34,999}) \\ &+ -2.547 \cdot \mathbf{1}(\text{\$35,000--\$49,999}) \\ &+ -3.05 \cdot \mathbf{1}(\text{\$50,000--\$99,999}) \\ &+ -3.5 \cdot \mathbf{1}(\text{\$100,000--\$199,999}) \\ &+ -3.784 \cdot \mathbf{1}(\text{\$200,000 or more}) \\ &+ 0.08 \cdot \mathbf{1}(\text{High school diploma or GED}) \\ &+ 1.077 \cdot \mathbf{1}(\text{Some college or technical school}) \\ &+ 1.791 \cdot \mathbf{1}(\text{College graduate}) \\ &+ 1.738 \cdot \mathbf{1}(\text{Graduate or professional degree}) \\ &+ -1.587 \cdot \mathbf{1}(\text{Current some-day smoker}) \\ &+ -1.99 \cdot \mathbf{1}(\text{Former smoker}) \\ &+ -2.937 \cdot \mathbf{1}(\text{Never smoker}) \end{aligned} \]
physhlth_days: Each additional physically unhealthy day is associated with 0.27 more mentally unhealthy days, holding all other variables constant.
bmi: Each one-unit increase in BMI is associated with 0.03 more mentally unhealthy days, holding all other variables constant.
sex (Female vs. Male): Females report 1.39 more mentally unhealthy days than males, holding all other variables constant.
exercise (Yes vs. No): Those who exercised report 0.65 fewer mentally unhealthy days compared to those who did not, holding all other variables constant.
age_group (25–29 vs. 18–24): Adults aged 25–29 report 1.06 fewer mentally unhealthy days than those aged 18–24, holding all other variables constant.
income ($50,000–$99,999 vs. < $15,000): Individuals in this income group report 3.05 fewer mentally unhealthy days, holding all other variables constant.
income ($200,000+ vs. < $15,000): Individuals earning $200,000 or more report 3.78 fewer mentally unhealthy days, holding all other variables constant.
mod_stats <- glance(mod_full)
tibble(
Statistic = c(
"R-squared",
"Adjusted R-squared",
"Root MSE (Residual Standard Error)",
"Overall F-statistic",
"Numerator df (p)",
"Denominator df (n - p - 1)",
"Overall p-value"
),
Value = c(
round(mod_stats$r.squared, 4),
round(mod_stats$adj.r.squared, 4),
round(mod_stats$sigma, 3),
round(mod_stats$statistic, 2),
mod_stats$df,
mod_stats$df.residual,
format.pval(mod_stats$p.value, digits = 3)
)
) |>
kable(caption = "Table 1.3. Model-Level Summary Statistics") |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Statistic | Value |
|---|---|
| R-squared | 0.1853 |
| Adjusted R-squared | 0.1824 |
| Root MSE (Residual Standard Error) | 7.352 |
| Overall F-statistic | 62.52 |
| Numerator df (p) | 29 |
| Denominator df (n - p - 1) | 7970 |
| Overall p-value | <2e-16 |
Interpretation:
The R-squared is 0.1853, indicating that approximately 18.5% of the variation in mentally unhealthy days is explained by the predictors included in the model.
The adjusted R-squared is 0.1824, which is slightly lower because it accounts for the number of predictors in the model. This suggests that most variables contribute meaningfully, but some may have limited additional explanatory value.
The Root MSE (Residual Standard Error) is 7.352, meaning that the model’s predictions differ from the observed values by about 7.35 mentally unhealthy days on average.
The overall F-test evaluates whether the model provides a better fit than a model with no predictors. The null hypothesis (H₀) is that all regression coefficients (except the intercept) are equal to zero. In this model, F(29, 7970) = 62.52 with p < 2e-16, so we reject the null hypothesis and conclude that the model is statistically significant overall, meaning that at least one predictor is associated with mentally unhealthy days.
#type3-anova
anova_type3 <- Anova(mod_full, type = "III")
anova_type3 |>
tidy() |>
mutate(
across(where(is.numeric), ~ round(., 4)),
`Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
) |>
kable(
caption = "Table 2.1. Type III Partial F-Tests — All Predictors",
col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
column_spec(6, bold = TRUE)| Predictor | Sum of Sq | df | F value | p-value | Significant? |
|---|---|---|---|---|---|
| (Intercept) | 5529.7722 | 1 | 102.3152 | 0.0000 | Yes |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 | Yes |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 | Yes |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 | Yes |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 | Yes |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 | Yes |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 | Yes |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 | Yes |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 | Yes |
| Residuals | 430750.0872 | 7970 | NA | NA | NA |
Predictors marked “Yes” make a statistically significant independent contribution to predicting mentally unhealthy days, after controlling for all other predictors simultaneously.
#chunk-test-income
mod_no_income <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
ft_income <- anova(mod_no_income, mod_full)
ft_income |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 2.2. Chunk Test: Does Income as a Group Improve the Model?",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking | 7976 | 435484.0 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking | 7970 | 430750.1 | 6 | 4733.894 | 14.5982 | 0 |
\[H_0: \beta_{\$15k-\$25k} = \beta_{\$25k-\$35k} = \beta_{\$35k-\$50k} = \beta_{\$50k-\$100k} = \beta_{\$100k-\$200k} = \beta_{\$200k+} = 0\] \[H_A: \text{At least one income coefficient} \neq 0\]
The F-statistic is 14.60 with 6 and 7,970 degrees of freedom, and a p < 0.001. We therefore reject the null hypothesis that all income coefficients are equal to zero. This indicates that income, as a group, significantly improves the model, and income categories collectively explain additional variation in mentally unhealthy days beyond the other predictors in the model.
mod_no_educ <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
ft_educ <- anova(mod_no_educ, mod_full)
ft_educ |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 2.3. Chunk Test: Does Education as a Group Improve the Model?",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking | 7974 | 432015.2 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking | 7970 | 430750.1 | 4 | 1265.15 | 5.8521 | 1e-04 |
f_educ <- round(ft_educ$F[2], 3)
p_educ <- ft_educ$`Pr(>F)`[2]
df1_educ <- ft_educ$Df[2]
df2_educ <- ft_educ$Res.Df[2]Hypothesis:
\[H_0: \beta_{\text{HS/GED}} = \beta_{\text{Some college}} = \beta_{\text{College grad}} = \beta_{\text{Grad degree}} = 0\] \[H_A: \text{At least one education coefficient} \neq 0\]
The F-statistic = 5.85 with 4 and 7,970 degrees of freedom, and a p < 0.001, therefore, we reject the H0. This indicates that education categories, as a group, significantly contribute to explaining variation in mentally unhealthy days, even after adjusting for physical health, BMI, sex, exercise, age, income, and smoking.
Type III partial F-tests indicate that physical health days, sex, exercise, age group, income, education, and smoking each independently contribute to predicting mentally unhealthy days after adjusting for all other variables. Physical health days and smoking appear to be among the strongest predictors. Chunk tests further show that even if some individual categories within variables like income and education are not significant, these variables still explain meaningful variation overall, supporting their importance at the group level.
#create-smoker-binary
brfss_analytic <- brfss_analytic |>
mutate(
smoker_current = factor(
case_when(
smoking %in% c("Current daily smoker",
"Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker",
"Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
levels = c("Non-smoker", "Current smoker") # Non-smoker is reference
)
)
table(brfss_analytic$smoker_current, useNA = "always")##
## Non-smoker Current smoker <NA>
## 7074 926 0
#fit-model-A
mod_A <- lm(
menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
tidy(mod_A, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.1. Model A — Main Effects Only",
col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
scroll_box(height = "360px")| Term | Estimate | SE | t | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.7553 | 0.9271 | 7.2869 | 0.0000 | 4.9381 | 8.5726 |
| physhlth_days | 0.2686 | 0.0101 | 26.6731 | 0.0000 | 0.2489 | 0.2884 |
| bmi | 0.0334 | 0.0132 | 2.5250 | 0.0116 | 0.0075 | 0.0593 |
| sexFemale | 1.3331 | 0.1673 | 7.9675 | 0.0000 | 1.0051 | 1.6611 |
| smoker_currentCurrent smoker | 2.1287 | 0.2712 | 7.8489 | 0.0000 | 1.5971 | 2.6604 |
| exerciseYes | -0.6725 | 0.2150 | -3.1274 | 0.0018 | -1.0940 | -0.2510 |
| age_group25-29 | -0.9149 | 0.5198 | -1.7602 | 0.0784 | -1.9338 | 0.1040 |
| age_group30-34 | -0.8823 | 0.5061 | -1.7434 | 0.0813 | -1.8743 | 0.1097 |
| age_group35-39 | -1.5810 | 0.4877 | -3.2421 | 0.0012 | -2.5369 | -0.6251 |
| age_group40-44 | -2.6157 | 0.4858 | -5.3847 | 0.0000 | -3.5680 | -1.6635 |
| age_group45-49 | -2.8246 | 0.4970 | -5.6836 | 0.0000 | -3.7988 | -1.8504 |
| age_group50-54 | -3.2600 | 0.4821 | -6.7628 | 0.0000 | -4.2050 | -2.3151 |
| age_group55-59 | -4.2301 | 0.4741 | -8.9219 | 0.0000 | -5.1595 | -3.3007 |
| age_group60-64 | -4.2484 | 0.4568 | -9.3000 | 0.0000 | -5.1439 | -3.3529 |
| age_group65-69 | -5.2338 | 0.4467 | -11.7163 | 0.0000 | -6.1095 | -4.3582 |
| age_group70-74 | -5.7023 | 0.4545 | -12.5457 | 0.0000 | -6.5933 | -4.8113 |
| age_group75-79 | -5.8977 | 0.4703 | -12.5401 | 0.0000 | -6.8197 | -4.9758 |
| age_group80+ | -6.4888 | 0.4737 | -13.6975 | 0.0000 | -7.4174 | -5.5602 |
| income$15,000-$24,999 | -1.6797 | 0.4698 | -3.5754 | 0.0004 | -2.6007 | -0.7588 |
| income$25,000-$34,999 | -2.1023 | 0.4486 | -4.6861 | 0.0000 | -2.9817 | -1.2229 |
| income$35,000-$49,999 | -2.5869 | 0.4390 | -5.8931 | 0.0000 | -3.4474 | -1.7264 |
| income$50,000-$99,999 | -3.0823 | 0.4114 | -7.4921 | 0.0000 | -3.8887 | -2.2758 |
| income$100,000-$199,999 | -3.5360 | 0.4300 | -8.2232 | 0.0000 | -4.3789 | -2.6931 |
| income$200,000 or more | -3.8625 | 0.5011 | -7.7078 | 0.0000 | -4.8448 | -2.8802 |
| educationHigh school diploma or GED | 0.2139 | 0.8119 | 0.2634 | 0.7922 | -1.3776 | 1.8053 |
| educationSome college or technical school | 1.1965 | 0.6907 | 1.7323 | 0.0833 | -0.1575 | 2.5505 |
| educationCollege graduate | 1.9035 | 0.6922 | 2.7499 | 0.0060 | 0.5466 | 3.2604 |
| educationGraduate or professional degree | 1.7456 | 0.6938 | 2.5160 | 0.0119 | 0.3856 | 3.1057 |
#fit-model-B
# includes sex, smoker_current, AND their interaction term
mod_B <- lm(
menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
tidy(mod_B, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.2. Model B — With sex x smoker_current Interaction",
col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
scroll_box(height = "400px")| Term | Estimate | SE | t | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.8994 | 0.9286 | 7.4301 | 0.0000 | 5.0791 | 8.7196 |
| physhlth_days | 0.2686 | 0.0101 | 26.6788 | 0.0000 | 0.2489 | 0.2883 |
| bmi | 0.0331 | 0.0132 | 2.5017 | 0.0124 | 0.0072 | 0.0590 |
| sexFemale | 1.1855 | 0.1775 | 6.6784 | 0.0000 | 0.8376 | 1.5335 |
| smoker_currentCurrent smoker | 1.5208 | 0.3654 | 4.1621 | 0.0000 | 0.8045 | 2.2371 |
| exerciseYes | -0.6728 | 0.2150 | -3.1301 | 0.0018 | -1.0942 | -0.2515 |
| age_group25-29 | -0.9202 | 0.5196 | -1.7709 | 0.0766 | -1.9388 | 0.0984 |
| age_group30-34 | -0.8924 | 0.5059 | -1.7640 | 0.0778 | -1.8841 | 0.0993 |
| age_group35-39 | -1.5929 | 0.4875 | -3.2673 | 0.0011 | -2.5485 | -0.6372 |
| age_group40-44 | -2.6286 | 0.4856 | -5.4127 | 0.0000 | -3.5806 | -1.6766 |
| age_group45-49 | -2.8425 | 0.4969 | -5.7209 | 0.0000 | -3.8165 | -1.8686 |
| age_group50-54 | -3.2778 | 0.4820 | -6.8012 | 0.0000 | -4.2226 | -2.3331 |
| age_group55-59 | -4.2499 | 0.4740 | -8.9652 | 0.0000 | -5.1791 | -3.3206 |
| age_group60-64 | -4.2640 | 0.4567 | -9.3364 | 0.0000 | -5.1593 | -3.3688 |
| age_group65-69 | -5.2506 | 0.4466 | -11.7563 | 0.0000 | -6.1261 | -4.3751 |
| age_group70-74 | -5.7111 | 0.4544 | -12.5686 | 0.0000 | -6.6018 | -4.8203 |
| age_group75-79 | -5.9076 | 0.4702 | -12.5646 | 0.0000 | -6.8292 | -4.9859 |
| age_group80+ | -6.4995 | 0.4736 | -13.7239 | 0.0000 | -7.4278 | -5.5711 |
| income$15,000-$24,999 | -1.6357 | 0.4700 | -3.4804 | 0.0005 | -2.5570 | -0.7144 |
| income$25,000-$34,999 | -2.0746 | 0.4486 | -4.6243 | 0.0000 | -2.9540 | -1.1952 |
| income$35,000-$49,999 | -2.5455 | 0.4392 | -5.7964 | 0.0000 | -3.4064 | -1.6847 |
| income$50,000-$99,999 | -3.0430 | 0.4116 | -7.3935 | 0.0000 | -3.8498 | -2.2362 |
| income$100,000-$199,999 | -3.5097 | 0.4300 | -8.1623 | 0.0000 | -4.3526 | -2.6668 |
| income$200,000 or more | -3.8405 | 0.5010 | -7.6651 | 0.0000 | -4.8226 | -2.8583 |
| educationHigh school diploma or GED | 0.1256 | 0.8124 | 0.1546 | 0.8772 | -1.4669 | 1.7180 |
| educationSome college or technical school | 1.1179 | 0.6912 | 1.6172 | 0.1059 | -0.2371 | 2.4729 |
| educationCollege graduate | 1.8179 | 0.6928 | 2.6239 | 0.0087 | 0.4598 | 3.1760 |
| educationGraduate or professional degree | 1.6691 | 0.6943 | 2.4040 | 0.0162 | 0.3081 | 3.0300 |
| sexFemale:smoker_currentCurrent smoker | 1.2833 | 0.5171 | 2.4819 | 0.0131 | 0.2697 | 2.2968 |
#interaction-test
int_test <- anova(mod_A, mod_B)
int_test |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.3. Partial F-Test: Model A (no interaction) vs. Model B (with interaction)",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education | 7972 | 432508.9 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
\[H_0: \beta_{\text{Female} \times \text{Current smoker}} = 0 \quad \text{(smoking–mental health association is the same for men and women)}\]
\[H_A: \beta_{\text{Female} \times \text{Current smoker}} \neq 0 \quad \text{(the association differs by sex)}\]
The F-statistic = 6.16 with p = 0.013 indicates that the interaction between sex and current smoking is statistically significant. Thus, we reject the null hypothesis and conclude that the relationship between current smoking and mentally unhealthy days differs by sex. In other words, the effect of smoking on mentally unhealthy days is not the same for males and females.
#Interpret interaction-coefs
b_B <- round(coef(mod_B), 3)
smoking_men <- b_B["smoker_currentCurrent smoker"]
smoking_women <- b_B["smoker_currentCurrent smoker"] +
b_B["sexFemale:smoker_currentCurrent smoker"]
cat("Estimated association of current smoking with mentally unhealthy days:\n")## Estimated association of current smoking with mentally unhealthy days:
## Among men : 1.521 days
## Among women : 2.804 days
## Difference : 1.283 days
Among men (reference sex group): Among men, current smoking is associated with 1.52 additional mentally unhealthy days compared to male non-smokers, holding all other predictors constant.
Among women: Among women, current smoking is associated with 2.80 additional mentally unhealthy days, holding all other variables constant.
Together: The difference between women and men is 1.28 days, indicating that current smoking has a stronger impact on mentally unhealthy days for women than for men.
#interaction-plot, Figure 3.1. Predicted Mentally Unhealthy Days by Smoking Status and Sex
pred_int <- ggpredict(mod_B, terms = c("smoker_current", "sex"))
# Convert to data frame for full control over the plot
pred_df <- as.data.frame(pred_int)
ggplot(pred_df, aes(x = x, y = predicted, color = group, group = group)) +
geom_point(size = 4, position = position_dodge(width = 0.3)) +
geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) +
geom_errorbar(
aes(ymin = conf.low, ymax = conf.high),
width = 0.15,
linewidth = 0.8,
position = position_dodge(width = 0.3)
) +
scale_color_brewer(palette = "Set1", name = "Sex") +
labs(
title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
subtitle = "Model B: sex x smoker_current interaction, adjusted for all other predictors",
x = "Smoking Status",
y = "Predicted Mentally Unhealthy Days (Past 30)",
caption = "Points = model-adjusted predictions; error bars = 95% confidence intervals."
) +
theme_minimal(base_size = 13) +
theme(legend.position = "right")Among U.S. adults, current smokers report more mentally unhealthy days on average than former or never smokers, even after adjusting for age, sex, income, education, physical health, and exercise. This association varies by sex, with women who smoke showing a greater increase in mentally unhealthy days compared to non-smoking women than the corresponding difference observed among men. These findings suggest that public health interventions targeting both smoking and mental health may benefit from sex-specific strategies. However, because the analysis is cross-sectional, the direction of causality cannot be determined—poor mental health may lead to smoking, smoking may worsen mental health, or both may stem from shared underlying factors.
Among U.S. adults, the number of mentally unhealthy days is linked to a combination of behavioral, health-related, and socioeconomic factors. Physical health stood out as the most influential predictor: each additional physically unhealthy day corresponded to about a 0.27-day increase in mentally unhealthy days, after accounting for other variables. Smoking was also strongly associated with poorer mental health, with current smokers reporting more mentally unhealthy days than former or never smokers, even after adjustment. The interaction analysis further showed that this relationship varies by sex, with smoking having a stronger impact among women than men.
Other variables also contributed to the pattern. Individuals who engaged in any exercise during the past 30 days reported fewer mentally unhealthy days on average compared to those who did not exercise. Age showed an inverse relationship, with older adults consistently reporting fewer mentally unhealthy days than younger adults. Socioeconomic factors, particularly income and education, demonstrated clear gradients: higher income and higher educational attainment were generally associated with better mental health outcomes. For instance, individuals earning between $50,000 and $99,999 reported substantially fewer mentally unhealthy days than those in the lowest income category. In contrast, BMI showed only a small association, suggesting a more limited or indirect role.
The model explained a modest portion of variability in the outcome, with an R-squared of 0.185 and an adjusted R-squared of 0.182. The minimal difference between these values suggests that the included predictors meaningfully contribute to the model without excessive overfitting. The overall model was statistically significant (F = 62.52, df = 29, 7970, p < 0.001), indicating that the predictors collectively improve the ability to explain mentally unhealthy days.
Group-level tests provided additional insight. Chunk tests showed that income and education significantly improved model fit when considered as sets of related categories, even if not every individual category was statistically significant. Similarly, including the interaction between sex and smoking improved the model, confirming that the association between smoking and mental health differs across sexes. Overall, these findings emphasize that mental health is shaped by a combination of behavioral, demographic, and socioeconomic influences, supporting the need for comprehensive and targeted public health approaches.
I used AI assistance (Chat GPT) to help troubleshoot R coding errors, particularly issues with the select() function and formatting regression equations in R Markdown. To verify accuracy, I cross-checked outputs with R results, ensured consistency with lecture materials, and confirmed that interpretations aligned with regression principles discussed in class.