library(tidyverse)
library(haven)
library(janitor)
library(broom)
library(knitr)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)brfss_raw <- read_xpt('/Users/emmanuelsmac/Desktop/LLCP2023.XPT ')
cat("Raw dataset dimensions:\n")
cat(" Rows: ", nrow(brfss_raw), "\n")
cat(" Columns:", ncol(brfss_raw), "\n")The raw 2023 BRFSS dataset contains 433,323 rows and 350 columns, representing one respondent per row across all U.S. states and territories for the 2023 survey year.
# ── CONFIRMED VARIABLE NAME MAPPING ──────────────────────────────────────────
# clean_names() in this version of janitor drops leading underscores entirely
# rather than replacing them with "x_". Exact names confirmed from your data:
#
# Raw BRFSS name → clean_names() output
# MENTHLTH → menthlth
# PHYSHLTH → physhlth
# _BMI5 → bmi5 (NOT x_bmi5)
# SEXVAR → sexvar
# EXERANY2 → exerany2
# _AGEG5YR → ageg5yr (NOT x_ageg5yr)
# _INCOMG1 → incomg1 (NOT x_incomg1)
# EDUCA → educa
# _SMOKER3 → smoker3 (NOT x_smoker3)
# ─────────────────────────────────────────────────────────────────────────────
brfss_raw_clean <- brfss_raw |>
clean_names()
# Diagnostic — verify exact names before selecting
cat("menthlth :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^menthlth$")], "\n")
cat("physhlth :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^physhlth$")], "\n")
cat("bmi5 :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^bmi5$")], "\n")
cat("sexvar :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^sexvar$")], "\n")
cat("exerany2 :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^exerany2$")], "\n")
cat("ageg5yr :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^ageg5yr$")], "\n")
cat("incomg1 :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^incomg1$")], "\n")
cat("educa :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^educa$")], "\n")
cat("smoker3 :", names(brfss_raw_clean)[str_detect(names(brfss_raw_clean), "^smoker3$")], "\n")
# ── Select the nine required variables ───────────────────────────────────────
brfss_selected <- brfss_raw_clean |>
select(
menthlth, # was MENTHLTH
physhlth, # was PHYSHLTH
bmi5, # was _BMI5 → confirmed: bmi5
sexvar, # was SEXVAR
exerany2, # was EXERANY2
ageg5yr, # was _AGEG5YR → confirmed: ageg5yr
incomg1, # was _INCOMG1 → confirmed: incomg1
educa, # was EDUCA
smoker3 # was _SMOKER3 → confirmed: smoker3
)
# ── Recode all nine variables ─────────────────────────────────────────────────
brfss_clean <- brfss_selected |>
mutate(
# Mentally unhealthy days: 88=None->0; 77/99=DK/Refused->NA; 1-30 kept
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
menthlth %in% c(77, 99) ~ NA_real_,
TRUE ~ NA_real_
),
# Physically unhealthy days: same recoding rules
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
physhlth %in% c(77, 99) ~ NA_real_,
TRUE ~ NA_real_
),
# BMI: stored as BMI × 100; divide by 100 for actual value; 9999->NA
bmi = case_when(
bmi5 == 9999 ~ NA_real_,
TRUE ~ as.numeric(bmi5) / 100
),
# Sex: 1=Male (reference), 2=Female
sex = factor(
case_when(
sexvar == 1 ~ "Male",
sexvar == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
),
# Exercise: 1=Yes, 2=No; 7/9->NA; No is reference
exercise = factor(
case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
exerany2 %in% c(7, 9) ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("No", "Yes")
),
# Age group: 13 five-year bands; 14->NA; 18-24 is reference
age_group = factor(
case_when(
ageg5yr == 1 ~ "18-24",
ageg5yr == 2 ~ "25-29",
ageg5yr == 3 ~ "30-34",
ageg5yr == 4 ~ "35-39",
ageg5yr == 5 ~ "40-44",
ageg5yr == 6 ~ "45-49",
ageg5yr == 7 ~ "50-54",
ageg5yr == 8 ~ "55-59",
ageg5yr == 9 ~ "60-64",
ageg5yr == 10 ~ "65-69",
ageg5yr == 11 ~ "70-74",
ageg5yr == 12 ~ "75-79",
ageg5yr == 13 ~ "80+",
ageg5yr == 14 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("18-24","25-29","30-34","35-39","40-44","45-49",
"50-54","55-59","60-64","65-69","70-74","75-79","80+")
),
# Income: 7 levels; 9->NA; "Less than $15,000" is reference
income = factor(
case_when(
incomg1 == 1 ~ "Less than $15,000",
incomg1 == 2 ~ "$15,000-$24,999",
incomg1 == 3 ~ "$25,000-$34,999",
incomg1 == 4 ~ "$35,000-$49,999",
incomg1 == 5 ~ "$50,000-$99,999",
incomg1 == 6 ~ "$100,000-$199,999",
incomg1 == 7 ~ "$200,000 or more",
incomg1 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than $15,000","$15,000-$24,999","$25,000-$34,999",
"$35,000-$49,999","$50,000-$99,999",
"$100,000-$199,999","$200,000 or more")
),
# Education: 5 levels; 1-2 collapsed; 9->NA
education = factor(
case_when(
educa %in% c(1, 2) ~ "Less than high school",
educa == 3 ~ "High school diploma or GED",
educa == 4 ~ "Some college or technical school",
educa == 5 ~ "College graduate",
educa == 6 ~ "Graduate or professional degree",
educa == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Less than high school",
"High school diploma or GED",
"Some college or technical school",
"College graduate",
"Graduate or professional degree")
),
# Smoking status: 4 levels; 9->NA
smoking = factor(
case_when(
smoker3 == 1 ~ "Current daily smoker",
smoker3 == 2 ~ "Current some-day smoker",
smoker3 == 3 ~ "Former smoker",
smoker3 == 4 ~ "Never smoker",
smoker3 == 9 ~ NA_character_,
TRUE ~ NA_character_
),
levels = c("Current daily smoker","Current some-day smoker",
"Former smoker","Never smoker")
)
) |>
select(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking)
# Save immediately — avoids reloading the 300MB XPT file in future sessions
saveRDS(brfss_clean, '/Users/emmanuelsmac/Desktop/LLCP2023.XPT ')
cat("Saved: brfss_hw3_clean.rds —", nrow(brfss_clean), "rows,",
ncol(brfss_clean), "columns\n")# Load from the saved RDS — runs in seconds, no need to reload the XPT file.
brfss_clean <- readRDS('/Users/emmanuelsmac/Desktop/LLCP2023.XPT ')
cat("Clean dataset dimensions:\n")## Clean dataset dimensions:
## Rows: 433323
## Columns: 9
## Variables: menthlth_days, physhlth_days, bmi, sex, exercise, age_group, income, education, smoking
miss_report <- brfss_clean |>
summarise(
across(
everything(),
list(
N_missing = ~ sum(is.na(.)),
Pct_missing = ~ round(mean(is.na(.)) * 100, 1)
),
.names = "{.col}__{.fn}"
)
) |>
pivot_longer(
everything(),
names_to = c("Variable", ".value"),
names_sep = "__"
) |>
rename(`N Missing` = N_missing, `% Missing` = Pct_missing)
miss_report |>
kable(
caption = "Table 0.1. Missing Data for All Nine Analysis Variables (Before Exclusions)",
col.names = c("Variable", "N Missing", "% Missing")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Variable | N Missing | % Missing |
|---|---|---|
| menthlth_days | 8108 | 1.9 |
| physhlth_days | 10785 | 2.5 |
| bmi | 40535 | 9.4 |
| sex | 0 | 0.0 |
| exercise | 1251 | 0.3 |
| age_group | 7779 | 1.8 |
| income | 86623 | 20.0 |
| education | 2325 | 0.5 |
| smoking | 23062 | 5.3 |
set.seed(1220)
brfss_analytic <- brfss_clean |>
drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
age_group, income, education, smoking) |>
slice_sample(n = 8000)
cat("Final analytic sample size: n =", nrow(brfss_analytic), "\n")## Final analytic sample size: n = 8000
Final analytic sample: After removing observations with missing
values on any of the nine analysis variables and drawing a reproducible
random sample using set.seed(1220), the analytic dataset
contains n = 8,000 observations.
brfss_analytic |>
select(menthlth_days, physhlth_days, bmi, exercise,
age_group, income, education, smoking, sex) |>
tbl_summary(
by = sex,
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
bmi ~ "Body Mass Index (kg/m²)",
exercise ~ "Any exercise in past 30 days",
age_group ~ "Age group",
income ~ "Annual household income",
education ~ "Education level",
smoking ~ "Smoking status"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) |>
add_overall() |>
add_p() |>
bold_labels() |>
italicize_levels() |>
modify_caption(
"**Table 0.2. Descriptive Statistics — BRFSS 2023 Analytic Sample (n = 8,000), Stratified by Sex**"
) |>
as_flex_table()Characteristic | Overall | Male | Female | p-value2 |
|---|---|---|---|---|
Mentally unhealthy days (past 30) | 4.3 (8.1) | 3.5 (7.5) | 5.1 (8.6) | <0.001 |
Physically unhealthy days (past 30) | 4.4 (8.7) | 4.0 (8.4) | 4.9 (8.9) | <0.001 |
Body Mass Index (kg/m²) | 28.7 (6.5) | 28.7 (6.0) | 28.7 (7.0) | 0.008 |
Any exercise in past 30 days | 6,240 (78%) | 3,146 (80%) | 3,094 (76%) | <0.001 |
Age group | <0.001 | |||
18-24 | 406 (5.1%) | 235 (6.0%) | 171 (4.2%) | |
25-29 | 408 (5.1%) | 219 (5.6%) | 189 (4.7%) | |
30-34 | 463 (5.8%) | 253 (6.4%) | 210 (5.2%) | |
35-39 | 565 (7.1%) | 263 (6.7%) | 302 (7.4%) | |
40-44 | 582 (7.3%) | 290 (7.4%) | 292 (7.2%) | |
45-49 | 518 (6.5%) | 266 (6.8%) | 252 (6.2%) | |
50-54 | 608 (7.6%) | 305 (7.7%) | 303 (7.5%) | |
55-59 | 660 (8.3%) | 308 (7.8%) | 352 (8.7%) | |
60-64 | 787 (9.8%) | 408 (10%) | 379 (9.3%) | |
65-69 | 901 (11%) | 418 (11%) | 483 (12%) | |
70-74 | 808 (10%) | 382 (9.7%) | 426 (10%) | |
75-79 | 663 (8.3%) | 325 (8.3%) | 338 (8.3%) | |
80+ | 631 (7.9%) | 264 (6.7%) | 367 (9.0%) | |
Annual household income | <0.001 | |||
Less than $15,000 | 407 (5.1%) | 160 (4.1%) | 247 (6.1%) | |
$15,000-$24,999 | 641 (8.0%) | 271 (6.9%) | 370 (9.1%) | |
$25,000-$34,999 | 871 (11%) | 376 (9.6%) | 495 (12%) | |
$35,000-$49,999 | 1,067 (13%) | 482 (12%) | 585 (14%) | |
$50,000-$99,999 | 2,511 (31%) | 1,251 (32%) | 1,260 (31%) | |
$100,000-$199,999 | 1,865 (23%) | 996 (25%) | 869 (21%) | |
$200,000 or more | 638 (8.0%) | 400 (10%) | 238 (5.9%) | |
Education level | 0.003 | |||
Less than high school | 124 (1.6%) | 75 (1.9%) | 49 (1.2%) | |
High school diploma or GED | 252 (3.2%) | 130 (3.3%) | 122 (3.0%) | |
Some college or technical school | 1,827 (23%) | 950 (24%) | 877 (22%) | |
College graduate | 2,138 (27%) | 1,018 (26%) | 1,120 (28%) | |
Graduate or professional degree | 3,659 (46%) | 1,763 (45%) | 1,896 (47%) | |
Smoking status | <0.001 | |||
Current daily smoker | 658 (8.2%) | 339 (8.6%) | 319 (7.8%) | |
Current some-day smoker | 268 (3.4%) | 151 (3.8%) | 117 (2.9%) | |
Former smoker | 2,262 (28%) | 1,207 (31%) | 1,055 (26%) | |
Never smoker | 4,812 (60%) | 2,239 (57%) | 2,573 (63%) | |
1Mean (SD); n (%) | ||||
2Wilcoxon rank sum test; Pearson's Chi-squared test | ||||
mod_full <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + education + smoking,
data = brfss_analytic
)
summary(mod_full)##
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise +
## age_group + income + education + smoking, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.080 -3.865 -1.597 0.712 30.471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.65053 0.95407 10.115 < 2e-16
## physhlth_days 0.26558 0.01007 26.384 < 2e-16
## bmi 0.03338 0.01321 2.527 0.011510
## sexFemale 1.39038 0.16750 8.301 < 2e-16
## exerciseYes -0.65116 0.21472 -3.033 0.002432
## age_group25-29 -1.05660 0.51938 -2.034 0.041950
## age_group30-34 -1.09395 0.50646 -2.160 0.030803
## age_group35-39 -1.81103 0.48851 -3.707 0.000211
## age_group40-44 -2.89307 0.48749 -5.935 3.07e-09
## age_group45-49 -3.05618 0.49769 -6.141 8.61e-10
## age_group50-54 -3.51674 0.48323 -7.277 3.72e-13
## age_group55-59 -4.49597 0.47555 -9.454 < 2e-16
## age_group60-64 -4.52215 0.45848 -9.863 < 2e-16
## age_group65-69 -5.57795 0.45019 -12.390 < 2e-16
## age_group70-74 -6.02536 0.45741 -13.173 < 2e-16
## age_group75-79 -6.28656 0.47484 -13.239 < 2e-16
## age_group80+ -6.81968 0.47684 -14.302 < 2e-16
## income$15,000-$24,999 -1.63703 0.46899 -3.491 0.000485
## income$25,000-$34,999 -2.07637 0.44797 -4.635 3.63e-06
## income$35,000-$49,999 -2.54685 0.43819 -5.812 6.40e-09
## income$50,000-$99,999 -3.05048 0.41069 -7.428 1.22e-13
## income$100,000-$199,999 -3.49984 0.42923 -8.154 4.07e-16
## income$200,000 or more -3.78409 0.50036 -7.563 4.38e-14
## educationHigh school diploma or GED 0.07991 0.81066 0.099 0.921484
## educationSome college or technical school 1.07679 0.68973 1.561 0.118520
## educationCollege graduate 1.79091 0.69119 2.591 0.009585
## educationGraduate or professional degree 1.73781 0.69250 2.509 0.012111
## smokingCurrent some-day smoker -1.58670 0.53523 -2.965 0.003041
## smokingFormer smoker -1.98971 0.33713 -5.902 3.74e-09
## smokingNever smoker -2.93681 0.32162 -9.131 < 2e-16
##
## (Intercept) ***
## physhlth_days ***
## bmi *
## sexFemale ***
## exerciseYes **
## age_group25-29 *
## age_group30-34 *
## age_group35-39 ***
## age_group40-44 ***
## age_group45-49 ***
## age_group50-54 ***
## age_group55-59 ***
## age_group60-64 ***
## age_group65-69 ***
## age_group70-74 ***
## age_group75-79 ***
## age_group80+ ***
## income$15,000-$24,999 ***
## income$25,000-$34,999 ***
## income$35,000-$49,999 ***
## income$50,000-$99,999 ***
## income$100,000-$199,999 ***
## income$200,000 or more ***
## educationHigh school diploma or GED
## educationSome college or technical school
## educationCollege graduate **
## educationGraduate or professional degree *
## smokingCurrent some-day smoker **
## smokingFormer smoker ***
## smokingNever smoker ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared: 0.1853, Adjusted R-squared: 0.1824
## F-statistic: 62.52 on 29 and 7970 DF, p-value: < 2.2e-16
tidy(mod_full, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 1.1. Full Model Coefficient Estimates with 95% Confidence Intervals",
col.names = c("Term","Estimate","SE","t-statistic","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
scroll_box(height = "420px")| Term | Estimate | SE | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 9.6505 | 0.9541 | 10.1151 | 0.0000 | 7.7803 | 11.5208 |
| physhlth_days | 0.2656 | 0.0101 | 26.3841 | 0.0000 | 0.2459 | 0.2853 |
| bmi | 0.0334 | 0.0132 | 2.5274 | 0.0115 | 0.0075 | 0.0593 |
| sexFemale | 1.3904 | 0.1675 | 8.3007 | 0.0000 | 1.0620 | 1.7187 |
| exerciseYes | -0.6512 | 0.2147 | -3.0326 | 0.0024 | -1.0721 | -0.2303 |
| age_group25-29 | -1.0566 | 0.5194 | -2.0343 | 0.0420 | -2.0747 | -0.0385 |
| age_group30-34 | -1.0939 | 0.5065 | -2.1600 | 0.0308 | -2.0867 | -0.1012 |
| age_group35-39 | -1.8110 | 0.4885 | -3.7072 | 0.0002 | -2.7686 | -0.8534 |
| age_group40-44 | -2.8931 | 0.4875 | -5.9346 | 0.0000 | -3.8487 | -1.9375 |
| age_group45-49 | -3.0562 | 0.4977 | -6.1408 | 0.0000 | -4.0318 | -2.0806 |
| age_group50-54 | -3.5167 | 0.4832 | -7.2775 | 0.0000 | -4.4640 | -2.5695 |
| age_group55-59 | -4.4960 | 0.4755 | -9.4543 | 0.0000 | -5.4282 | -3.5638 |
| age_group60-64 | -4.5221 | 0.4585 | -9.8633 | 0.0000 | -5.4209 | -3.6234 |
| age_group65-69 | -5.5779 | 0.4502 | -12.3903 | 0.0000 | -6.4604 | -4.6955 |
| age_group70-74 | -6.0254 | 0.4574 | -13.1728 | 0.0000 | -6.9220 | -5.1287 |
| age_group75-79 | -6.2866 | 0.4748 | -13.2392 | 0.0000 | -7.2174 | -5.3557 |
| age_group80+ | -6.8197 | 0.4768 | -14.3019 | 0.0000 | -7.7544 | -5.8850 |
| income$15,000-$24,999 | -1.6370 | 0.4690 | -3.4905 | 0.0005 | -2.5564 | -0.7177 |
| income$25,000-$34,999 | -2.0764 | 0.4480 | -4.6351 | 0.0000 | -2.9545 | -1.1982 |
| income$35,000-$49,999 | -2.5469 | 0.4382 | -5.8122 | 0.0000 | -3.4058 | -1.6879 |
| income$50,000-$99,999 | -3.0505 | 0.4107 | -7.4277 | 0.0000 | -3.8555 | -2.2454 |
| income$100,000-$199,999 | -3.4998 | 0.4292 | -8.1537 | 0.0000 | -4.3413 | -2.6584 |
| income$200,000 or more | -3.7841 | 0.5004 | -7.5628 | 0.0000 | -4.7649 | -2.8033 |
| educationHigh school diploma or GED | 0.0799 | 0.8107 | 0.0986 | 0.9215 | -1.5092 | 1.6690 |
| educationSome college or technical school | 1.0768 | 0.6897 | 1.5612 | 0.1185 | -0.2753 | 2.4288 |
| educationCollege graduate | 1.7909 | 0.6912 | 2.5911 | 0.0096 | 0.4360 | 3.1458 |
| educationGraduate or professional degree | 1.7378 | 0.6925 | 2.5095 | 0.0121 | 0.3803 | 3.0953 |
| smokingCurrent some-day smoker | -1.5867 | 0.5352 | -2.9645 | 0.0030 | -2.6359 | -0.5375 |
| smokingFormer smoker | -1.9897 | 0.3371 | -5.9020 | 0.0000 | -2.6506 | -1.3289 |
| smokingNever smoker | -2.9368 | 0.3216 | -9.1313 | 0.0000 | -3.5673 | -2.3063 |
## (Intercept)
## 9.651
## physhlth_days
## 0.266
## bmi
## 0.033
## sexFemale
## 1.390
## exerciseYes
## -0.651
## age_group25-29
## -1.057
## age_group30-34
## -1.094
## age_group35-39
## -1.811
## age_group40-44
## -2.893
## age_group45-49
## -3.056
## age_group50-54
## -3.517
## age_group55-59
## -4.496
## age_group60-64
## -4.522
## age_group65-69
## -5.578
## age_group70-74
## -6.025
## age_group75-79
## -6.287
## age_group80+
## -6.820
## income$15,000-$24,999
## -1.637
## income$25,000-$34,999
## -2.076
## income$35,000-$49,999
## -2.547
## income$50,000-$99,999
## -3.050
## income$100,000-$199,999
## -3.500
## income$200,000 or more
## -3.784
## educationHigh school diploma or GED
## 0.080
## educationSome college or technical school
## 1.077
## educationCollege graduate
## 1.791
## educationGraduate or professional degree
## 1.738
## smokingCurrent some-day smoker
## -1.587
## smokingFormer smoker
## -1.990
## smokingNever smoker
## -2.937
The fitted regression equation (coefficients rounded to three decimal places) is:
\[ \begin{aligned} \widehat{\text{MentalHealthDays}} =\ &9.651 \\ &+ 0.266 \cdot \text{PhysicalHealthDays} \\ &+ 0.033 \cdot \text{BMI} \\ &+ 1.39 \cdot \mathbb{1}[\text{Female}] \\ &+ -0.651 \cdot \mathbb{1}[\text{Exercise=Yes}] \\ &+ -1.057 \cdot \mathbb{1}[\text{25-29}] + -1.094 \cdot \mathbb{1}[\text{30-34}] + -1.811 \cdot \mathbb{1}[\text{35-39}] \\ &+ -2.893 \cdot \mathbb{1}[\text{40-44}] + -3.056 \cdot \mathbb{1}[\text{45-49}] + -3.517 \cdot \mathbb{1}[\text{50-54}] \\ &+ -4.496 \cdot \mathbb{1}[\text{55-59}] + -4.522 \cdot \mathbb{1}[\text{60-64}] + -5.578 \cdot \mathbb{1}[\text{65-69}] \\ &+ -6.025 \cdot \mathbb{1}[\text{70-74}] + -6.287 \cdot \mathbb{1}[\text{75-79}] + -6.82 \cdot \mathbb{1}[\text{80+}] \\ &+ -1.637 \cdot \mathbb{1}[\text{\$15k-\$25k}] + -2.076 \cdot \mathbb{1}[\text{\$25k-\$35k}] + -2.547 \cdot \mathbb{1}[\text{\$35k-\$50k}] \\ &+ -3.05 \cdot \mathbb{1}[\text{\$50k-\$100k}] + -3.5 \cdot \mathbb{1}[\text{\$100k-\$200k}] + -3.784 \cdot \mathbb{1}[\text{\$200k+}] \\ &+ 0.08 \cdot \mathbb{1}[\text{HS/GED}] + 1.077 \cdot \mathbb{1}[\text{Some college}] \\ &+ 1.791 \cdot \mathbb{1}[\text{College grad}] + 1.738 \cdot \mathbb{1}[\text{Grad degree}] \\ &+ -1.587 \cdot \mathbb{1}[\text{Some-day smoker}] + -1.99 \cdot \mathbb{1}[\text{Former smoker}] + -2.937 \cdot \mathbb{1}[\text{Never smoker}] \end{aligned} \]
Step 4: Coefficient Interpretations
physhlth_days (Continuous)
Each additional day of physical illness in the past 30 days is associated with an estimated0.266 additional mentally unhealthy day on average, holding all other predictors (BMI, sex, exercise, age group, income, education, and smoking) constant. This is the strongest continuous predictor in the model.
bmi (Continuous)
Each one-unit increase in BMI (kg/m²) is associated with an estimated change of 0.033 mentally unhealthy days, holding all other predictors constant. The small magnitude suggests a modest independent association after adjusting for the other variables.
sex: Female vs. Male (Reference)
Females report an estimated 1.39 more mentally unhealthy days on average compared to males (the reference group), holding all other variables constant.
exercise: Yes vs. No (Reference)
Adults who reported any exercise in the past 30 days report an estimated 0.651 fewer mentally unhealthy days compared to those reporting no exercise, holding all other variables constant.
Age Group: 25–29 vs. 18–24 (Reference)
Adults aged 25–29 report an estimated 1.057 fewer mentally unhealthy days compared to those aged 18–24 (the reference group), holding all other predictors constant.
Income: Two Coefficients vs. Less than $15,000 (Reference)
$50,000–$99,999 vs. Less than $15,000:
Adults earning $50,000–$99,999 annually report an estimated 3.05 fewer mentally unhealthy days compared to those earning less than $15,000, holding all other variables constant.
$200,000 or more vs. Less than $15,000:
Adults earning $200,000 or more report an estimated 3.784 fewer mentally unhealthy days compared to those earning less than $15,000, holding all other variables constant. This reflects the persistent socioeconomic gradient in mental health.
mod_stats <- glance(mod_full)
tibble(
Statistic = c(
"R-squared",
"Adjusted R-squared",
"Root MSE (Residual Standard Error)",
"Overall F-statistic",
"Numerator df (p)",
"Denominator df (n - p - 1)",
"Overall p-value"
),
Value = c(
round(mod_stats$r.squared, 4),
round(mod_stats$adj.r.squared, 4),
round(mod_stats$sigma, 3),
round(mod_stats$statistic, 2),
mod_stats$df,
mod_stats$df.residual,
format.pval(mod_stats$p.value, digits = 3)
)
) |>
kable(caption = "Table 1.2. Model-Level Summary Statistics") |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Statistic | Value |
|---|---|
| R-squared | 0.1853 |
| Adjusted R-squared | 0.1824 |
| Root MSE (Residual Standard Error) | 7.352 |
| Overall F-statistic | 62.52 |
| Numerator df (p) | 29 |
| Denominator df (n - p - 1) | 7970 |
| Overall p-value | <2e-16 |
R-squared
\(R^2\) =0.1853: approximately 18.5% of the total variability in mentally unhealthy days is explained by all predictors combined.
Adjusted R-squared
Adjusted \(R^2\) =0.1824, slightly lower than \(R^2\). Adjusted \(R^2\) penalizes the model for each additional predictor added, correcting for the mechanical increase in \(R^2\) that occurs even when predictors add no real information. The small gap confirms the predictors are genuinely contributing.
Root MSE
The residual standard error is 7.352 days — the typical absolute distance between an observed and a model-predicted value of mentally unhealthy days.
Overall F-test
\[H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0 \quad \text{(no predictor is associated with mentally unhealthy days)}\] \[H_A: \text{At least one } \beta_j \neq 0\]
\(F(29, 7970) = 62.52\), \(p < 0.001\). We reject \(H_0\) and conclude that at least one predictor is significantly associated with mentally unhealthy days.
anova_type3 <- Anova(mod_full, type = "III")
anova_type3 |>
tidy() |>
mutate(
across(where(is.numeric), ~ round(., 4)),
`Significant (alpha = 0.05)` = ifelse(p.value < 0.05, "Yes", "No")
) |>
kable(
caption = "Table 2.1. Type III Partial F-Tests — All Predictors",
col.names = c("Predictor","Sum of Sq","df","F value","p-value","Significant?")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
column_spec(6, bold = TRUE)| Predictor | Sum of Sq | df | F value | p-value | Significant? |
|---|---|---|---|---|---|
| (Intercept) | 5529.7722 | 1 | 102.3152 | 0.0000 | Yes |
| physhlth_days | 37622.7952 | 1 | 696.1198 | 0.0000 | Yes |
| bmi | 345.2408 | 1 | 6.3879 | 0.0115 | Yes |
| sex | 3723.8662 | 1 | 68.9012 | 0.0000 | Yes |
| exercise | 497.0434 | 1 | 9.1966 | 0.0024 | Yes |
| age_group | 30092.1774 | 12 | 46.3986 | 0.0000 | Yes |
| income | 4733.8943 | 6 | 14.5982 | 0.0000 | Yes |
| education | 1265.1504 | 4 | 5.8521 | 0.0001 | Yes |
| smoking | 5101.1076 | 3 | 31.4613 | 0.0000 | Yes |
| Residuals | 430750.0872 | 7970 | NA | NA | NA |
Predictors marked Yes make a statistically significant independent contribution to predicting mentally unhealthy days, after adjusting for all other predictors simultaneously.
mod_no_income <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + education + smoking,
data = brfss_analytic
)
ft_income <- anova(mod_no_income, mod_full)
ft_income |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 2.2. Chunk Test: Does Income as a Group Improve the Model?",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + education + smoking | 7976 | 435484.0 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking | 7970 | 430750.1 | 6 | 4733.894 | 14.5982 | 0 |
\[H_0: \beta_{\$15k-\$25k} = \beta_{\$25k-\$35k} = \beta_{\$35k-\$50k} = \beta_{\$50k-\$100k} = \beta_{\$100k-\$200k} = \beta_{\$200k+} = 0\] \[H_A: \text{At least one income coefficient} \neq 0\]
\(F(6, 7970) = 14.598\), \(p < 0.001\). We reject \(H_0\) at \(\alpha = 0.05\). Income as a group adds statistically significant explanatory value for mentally unhealthy days beyond all other predictors.
mod_no_educ <- lm(
menthlth_days ~ physhlth_days + bmi + sex + exercise +
age_group + income + smoking,
data = brfss_analytic
)
ft_educ <- anova(mod_no_educ, mod_full)
ft_educ |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 2.3. Chunk Test: Does Education as a Group Improve the Model?",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + smoking | 7974 | 432015.2 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + income + education + smoking | 7970 | 430750.1 | 4 | 1265.15 | 5.8521 | 1e-04 |
f_educ <- round(ft_educ$F[2], 3)
p_educ <- ft_educ$`Pr(>F)`[2]
df1_educ <- ft_educ$Df[2]
df2_educ <- ft_educ$Res.Df[2]Hypotheses:
\[H_0: \beta_{\text{HS/GED}} = \beta_{\text{Some college}} = \beta_{\text{College grad}} = \beta_{\text{Grad degree}} = 0\] \[H_A: \text{At least one education coefficient} \neq 0\]
\(F(4, 7970) = 5.852\), \(p < 0.001\). We reject \(H_0\) at \(\alpha = 0.05\). Education as a group adds statistically significant explanatory value for mentally unhealthy days beyond all other predictors.
The Type III partial F-tests confirm that physical health days, sex,
exercise, age group, income, education, and smoking each make
statistically significant independent contributions to predicting
mentally unhealthy days after adjusting for all other predictors
simultaneously. Physical health days and smoking status were among the
most substantively notable predictors. The chunk tests for income and
education extend the individual t-test results in an important way: even
when individual dummy coefficients within a multi-level categorical
variable do not all reach significance on their own which can happen
when adjacent categories are closely spaced the variable as a whole may
still explain meaningful variation. Testing income and education
collectively with anova() confirms both variables
contribute significant group-level predictive value, providing stronger
justification for their inclusion than relying on any single dummy
coefficient alone.
brfss_analytic <- brfss_analytic |>
mutate(
smoker_current = factor(
case_when(
smoking %in% c("Current daily smoker",
"Current some-day smoker") ~ "Current smoker",
smoking %in% c("Former smoker",
"Never smoker") ~ "Non-smoker",
TRUE ~ NA_character_
),
levels = c("Non-smoker", "Current smoker") # Non-smoker is reference
)
)
table(brfss_analytic$smoker_current, useNA = "always")##
## Non-smoker Current smoker <NA>
## 7074 926 0
mod_A <- lm(
menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
tidy(mod_A, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.1. Model A — Main Effects Only",
col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
scroll_box(height = "360px")| Term | Estimate | SE | t | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.7553 | 0.9271 | 7.2869 | 0.0000 | 4.9381 | 8.5726 |
| physhlth_days | 0.2686 | 0.0101 | 26.6731 | 0.0000 | 0.2489 | 0.2884 |
| bmi | 0.0334 | 0.0132 | 2.5250 | 0.0116 | 0.0075 | 0.0593 |
| sexFemale | 1.3331 | 0.1673 | 7.9675 | 0.0000 | 1.0051 | 1.6611 |
| smoker_currentCurrent smoker | 2.1287 | 0.2712 | 7.8489 | 0.0000 | 1.5971 | 2.6604 |
| exerciseYes | -0.6725 | 0.2150 | -3.1274 | 0.0018 | -1.0940 | -0.2510 |
| age_group25-29 | -0.9149 | 0.5198 | -1.7602 | 0.0784 | -1.9338 | 0.1040 |
| age_group30-34 | -0.8823 | 0.5061 | -1.7434 | 0.0813 | -1.8743 | 0.1097 |
| age_group35-39 | -1.5810 | 0.4877 | -3.2421 | 0.0012 | -2.5369 | -0.6251 |
| age_group40-44 | -2.6157 | 0.4858 | -5.3847 | 0.0000 | -3.5680 | -1.6635 |
| age_group45-49 | -2.8246 | 0.4970 | -5.6836 | 0.0000 | -3.7988 | -1.8504 |
| age_group50-54 | -3.2600 | 0.4821 | -6.7628 | 0.0000 | -4.2050 | -2.3151 |
| age_group55-59 | -4.2301 | 0.4741 | -8.9219 | 0.0000 | -5.1595 | -3.3007 |
| age_group60-64 | -4.2484 | 0.4568 | -9.3000 | 0.0000 | -5.1439 | -3.3529 |
| age_group65-69 | -5.2338 | 0.4467 | -11.7163 | 0.0000 | -6.1095 | -4.3582 |
| age_group70-74 | -5.7023 | 0.4545 | -12.5457 | 0.0000 | -6.5933 | -4.8113 |
| age_group75-79 | -5.8977 | 0.4703 | -12.5401 | 0.0000 | -6.8197 | -4.9758 |
| age_group80+ | -6.4888 | 0.4737 | -13.6975 | 0.0000 | -7.4174 | -5.5602 |
| income$15,000-$24,999 | -1.6797 | 0.4698 | -3.5754 | 0.0004 | -2.6007 | -0.7588 |
| income$25,000-$34,999 | -2.1023 | 0.4486 | -4.6861 | 0.0000 | -2.9817 | -1.2229 |
| income$35,000-$49,999 | -2.5869 | 0.4390 | -5.8931 | 0.0000 | -3.4474 | -1.7264 |
| income$50,000-$99,999 | -3.0823 | 0.4114 | -7.4921 | 0.0000 | -3.8887 | -2.2758 |
| income$100,000-$199,999 | -3.5360 | 0.4300 | -8.2232 | 0.0000 | -4.3789 | -2.6931 |
| income$200,000 or more | -3.8625 | 0.5011 | -7.7078 | 0.0000 | -4.8448 | -2.8802 |
| educationHigh school diploma or GED | 0.2139 | 0.8119 | 0.2634 | 0.7922 | -1.3776 | 1.8053 |
| educationSome college or technical school | 1.1965 | 0.6907 | 1.7323 | 0.0833 | -0.1575 | 2.5505 |
| educationCollege graduate | 1.9035 | 0.6922 | 2.7499 | 0.0060 | 0.5466 | 3.2604 |
| educationGraduate or professional degree | 1.7456 | 0.6938 | 2.5160 | 0.0119 | 0.3856 | 3.1057 |
# The * operator includes sex, smoker_current, AND their interaction term
mod_B <- lm(
menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
exercise + age_group + income + education,
data = brfss_analytic
)
tidy(mod_B, conf.int = TRUE) |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.2. Model B — With sex x smoker_current Interaction",
col.names = c("Term","Estimate","SE","t","p-value","95% CI Lower","95% CI Upper")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) |>
scroll_box(height = "400px")| Term | Estimate | SE | t | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 6.8994 | 0.9286 | 7.4301 | 0.0000 | 5.0791 | 8.7196 |
| physhlth_days | 0.2686 | 0.0101 | 26.6788 | 0.0000 | 0.2489 | 0.2883 |
| bmi | 0.0331 | 0.0132 | 2.5017 | 0.0124 | 0.0072 | 0.0590 |
| sexFemale | 1.1855 | 0.1775 | 6.6784 | 0.0000 | 0.8376 | 1.5335 |
| smoker_currentCurrent smoker | 1.5208 | 0.3654 | 4.1621 | 0.0000 | 0.8045 | 2.2371 |
| exerciseYes | -0.6728 | 0.2150 | -3.1301 | 0.0018 | -1.0942 | -0.2515 |
| age_group25-29 | -0.9202 | 0.5196 | -1.7709 | 0.0766 | -1.9388 | 0.0984 |
| age_group30-34 | -0.8924 | 0.5059 | -1.7640 | 0.0778 | -1.8841 | 0.0993 |
| age_group35-39 | -1.5929 | 0.4875 | -3.2673 | 0.0011 | -2.5485 | -0.6372 |
| age_group40-44 | -2.6286 | 0.4856 | -5.4127 | 0.0000 | -3.5806 | -1.6766 |
| age_group45-49 | -2.8425 | 0.4969 | -5.7209 | 0.0000 | -3.8165 | -1.8686 |
| age_group50-54 | -3.2778 | 0.4820 | -6.8012 | 0.0000 | -4.2226 | -2.3331 |
| age_group55-59 | -4.2499 | 0.4740 | -8.9652 | 0.0000 | -5.1791 | -3.3206 |
| age_group60-64 | -4.2640 | 0.4567 | -9.3364 | 0.0000 | -5.1593 | -3.3688 |
| age_group65-69 | -5.2506 | 0.4466 | -11.7563 | 0.0000 | -6.1261 | -4.3751 |
| age_group70-74 | -5.7111 | 0.4544 | -12.5686 | 0.0000 | -6.6018 | -4.8203 |
| age_group75-79 | -5.9076 | 0.4702 | -12.5646 | 0.0000 | -6.8292 | -4.9859 |
| age_group80+ | -6.4995 | 0.4736 | -13.7239 | 0.0000 | -7.4278 | -5.5711 |
| income$15,000-$24,999 | -1.6357 | 0.4700 | -3.4804 | 0.0005 | -2.5570 | -0.7144 |
| income$25,000-$34,999 | -2.0746 | 0.4486 | -4.6243 | 0.0000 | -2.9540 | -1.1952 |
| income$35,000-$49,999 | -2.5455 | 0.4392 | -5.7964 | 0.0000 | -3.4064 | -1.6847 |
| income$50,000-$99,999 | -3.0430 | 0.4116 | -7.3935 | 0.0000 | -3.8498 | -2.2362 |
| income$100,000-$199,999 | -3.5097 | 0.4300 | -8.1623 | 0.0000 | -4.3526 | -2.6668 |
| income$200,000 or more | -3.8405 | 0.5010 | -7.6651 | 0.0000 | -4.8226 | -2.8583 |
| educationHigh school diploma or GED | 0.1256 | 0.8124 | 0.1546 | 0.8772 | -1.4669 | 1.7180 |
| educationSome college or technical school | 1.1179 | 0.6912 | 1.6172 | 0.1059 | -0.2371 | 2.4729 |
| educationCollege graduate | 1.8179 | 0.6928 | 2.6239 | 0.0087 | 0.4598 | 3.1760 |
| educationGraduate or professional degree | 1.6691 | 0.6943 | 2.4040 | 0.0162 | 0.3081 | 3.0300 |
| sexFemale:smoker_currentCurrent smoker | 1.2833 | 0.5171 | 2.4819 | 0.0131 | 0.2697 | 2.2968 |
int_test <- anova(mod_A, mod_B)
int_test |>
tidy() |>
mutate(across(where(is.numeric), ~ round(., 4))) |>
kable(
caption = "Table 3.3. Partial F-Test: Model A (no interaction) vs. Model B (with interaction)",
col.names = c("Model","Res. df","RSS","df","Sum of Sq","F","p-value")
) |>
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)| Model | Res. df | RSS | df | Sum of Sq | F | p-value |
|---|---|---|---|---|---|---|
| menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education | 7972 | 432508.9 | NA | NA | NA | NA |
| menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education | 7971 | 432174.9 | 1 | 333.9749 | 6.1598 | 0.0131 |
\[H_0: \beta_{\text{Female} \times \text{Current smoker}} = 0 \quad \text{(smoking–mental health association is the same for men and women)}\]
\[H_A: \beta_{\text{Female} \times \text{Current smoker}} \neq 0 \quad \text{(the association differs by sex)}\]
\(F(1, 7971) = 6.16\), \(p = 0.0131\). We reject \(H_0\) at \(\alpha = 0.05\) and conclude that the association between current smoking and mentally unhealthy days differs significantly by sex.
b_B <- round(coef(mod_B), 3)
smoking_men <- b_B["smoker_currentCurrent smoker"]
smoking_women <- b_B["smoker_currentCurrent smoker"] +
b_B["sexFemale:smoker_currentCurrent smoker"]
cat("Estimated association of current smoking with mentally unhealthy days:\n")## Estimated association of current smoking with mentally unhealthy days:
## Among men : 1.521 days
## Among women : 2.804 days
## Difference : 1.283 days
Among men (reference sex group): Male current smokers report an estimated 1.521 days more mentally unhealthy days compared to male non-smokers, holding all other predictors constant.
Among women: The estimated association of current smoking for women = 1.521 + 1.283 = 2.804 days. Female current smokers report an estimated 2.804 more mentally unhealthy days compared to female non-smokers, holding all other variables constant.
Together: The difference in the smoking effect between women and men is 1.283 days. Current smoking is more strongly associated with mentally unhealthy days among women than among men.
pred_int <- ggpredict(mod_B, terms = c("smoker_current", "sex"))
# Convert to data frame for full control over the plot
pred_df <- as.data.frame(pred_int)
ggplot(pred_df, aes(x = x, y = predicted, color = group, group = group)) +
geom_point(size = 4, position = position_dodge(width = 0.3)) +
geom_line(linewidth = 1.2, position = position_dodge(width = 0.3)) +
geom_errorbar(
aes(ymin = conf.low, ymax = conf.high),
width = 0.15,
linewidth = 0.8,
position = position_dodge(width = 0.3)
) +
scale_color_brewer(palette = "Set1", name = "Sex") +
labs(
title = "Predicted Mentally Unhealthy Days by Smoking Status and Sex",
subtitle = "Model B: sex x smoker_current interaction, adjusted for all other predictors",
x = "Smoking Status",
y = "Predicted Mentally Unhealthy Days (Past 30)",
caption = "Points = model-adjusted predictions; error bars = 95% confidence intervals."
) +
theme_minimal(base_size = 13) +
theme(legend.position = "right")Figure 3.1. Predicted Mentally Unhealthy Days by Smoking Status and Sex (Model B)
Among U.S. adults, current smokers report more mentally unhealthy days on average than former or never smokers, even after accounting for age, sex, income, education, physical health, and exercise. The strength of this association appears to differ between men and women, with women who smoke experiencing a larger gap in mental health burden compared to non-smoking women, relative to the corresponding gap among men. These findings suggest that public health programs addressing both tobacco use and mental health may benefit from sex-specific approaches. Because this analysis is cross-sectional, we cannot determine the direction of causality — poor mental health may increase the likelihood of smoking, smoking may worsen mental health, or both may reflect shared upstream causes.
The results suggest that mentally unhealthy days among U.S. adults are associated with a wide range of individual, behavioral, and socioeconomic factors. Physical health days emerged as the strongest predictor, consistent with the extensive literature documenting the bidirectional relationship between physical illness and psychological distress. Current smokers — particularly daily smokers — reported substantially more mentally unhealthy days than former or never smokers even after controlling for other variables, and the interaction analysis indicates this association may differ by sex. Income and education both showed clear inverse gradients, with lower socioeconomic status linked to higher mental health burden, reinforcing the central role of social determinants in population mental health. Predictors with weaker associations such as BMI may operate through intermediate pathways not captured in this cross-sectional model. The primary limitation is the study’s cross-sectional design: because exposure and outcome are measured simultaneously, we cannot establish temporal precedence or rule out reverse causality — for example, we cannot determine whether smoking worsens mental health or whether psychological distress drives smoking initiation. The BRFSS also relies on self-reported telephone interviews, which are subject to recall bias and social desirability bias for sensitive outcomes. Two specific confounders likely to bias the estimated associations include social support networks, individuals with weaker social ties are simultaneously more likely to smoke and to experience psychological distress — and pre-existing mental health diagnoses, people with diagnosed depression or anxiety have markedly elevated smoking rates, which would inflate the observed smoking–mental health association beyond any causal pathway.
Adjusted \(R^2\) offers an important
correction that raw \(R^2\) does not:
because \(R^2\) can only increase as
predictors are added, including even uninformative variables will
artificially improve apparent fit. In a model with many categorical
predictors — each generating multiple dummy variables — this inflation
can be substantial. Adjusted \(R^2\)
penalizes each additional degree of freedom consumed, providing a more
honest assessment of explanatory power. The small gap between \(R^2\) (0.1853) and Adjusted \(R^2\) (0.1824) confirms the predictors are
genuinely informative rather than noise. Chunk tests address a
complementary limitation of individual t-tests: when a categorical
predictor is represented by multiple dummy variables, individual
coefficients may not reach significance even when the variable as a
whole is strongly predictive, because each comparison targets only a
narrow contrast against the reference group. Testing income and
education collectively using anova() asks the more
appropriate question — does the variable as a whole explain meaningful
variation? — and both passed this test decisively.
I used AI assistance (Claude) for replacing deprecated
geom_errorbarh() with
geom_errorbar(orientation = "y"). I struggled trying to get
the plot. Also when I completed the work I had an error when trying to
knit so I checked out and realized there was a chuck to be removed which
was done on chatgpt with the prompt I had.