cat("Downloading NHANES 2017-2018 files...\n")
## Downloading NHANES 2017-2018 files...
demo <- nhanes("DEMO_J") # Demographics
slq <- nhanes("SLQ_J") # Sleep questionnaire
paq <- nhanes("PAQ_J") # Physical activity
bmx <- nhanes("BMX_J") # Body measures
dpq <- nhanes("DPQ_J") # PHQ-9 depression screener
inq <- nhanes("INQ_J") # Income
cat("Files loaded:\n")
## Files loaded:
cat(" DEMO_J:", nrow(demo), "| SLQ_J:", nrow(slq),
"| PAQ_J:", nrow(paq), "| BMX_J:", nrow(bmx),
"| DPQ_J:", nrow(dpq), "| INQ_J:", nrow(inq), "\n")
## DEMO_J: 9254 | SLQ_J: 6161 | PAQ_J: 5856 | BMX_J: 8704 | DPQ_J: 5533 | INQ_J: 9254
All six files are merged by SEQN (respondent sequence number) using sequential left joins anchored to DEMO_J.
nhanes_merged <- demo |>
left_join(slq, by = "SEQN") |>
left_join(paq, by = "SEQN") |>
left_join(bmx, by = "SEQN") |>
left_join(dpq, by = "SEQN") |>
left_join(inq, by = "SEQN")
starting_n <- nrow(nhanes_merged)
cat("Starting merged sample N =", starting_n, "\n")
## Starting merged sample N = 9254
Four exclusion criteria applied sequentially.
# Step 1: Restrict to adults aged 20–70 years
step1 <- nhanes_merged |> dplyr::filter(RIDAGEYR >= 20 & RIDAGEYR <= 70)
n1 <- nrow(step1)
cat("Step 1 — Age 20-70:", n1, "| excluded:", starting_n - n1, "\n")
## Step 1 — Age 20-70: 4606 | excluded: 4648
# Step 2: Exclude missing sleep duration (SLD012)
step2 <- step1 |> dplyr::filter(!is.na(SLD012))
n2 <- nrow(step2)
cat("Step 2 — Complete sleep:", n2, "| excluded:", n1 - n2, "\n")
## Step 2 — Complete sleep: 4573 | excluded: 33
# Step 3: Exclude ambiguous physical activity (PAQ605 codes 3 = Don't know)
# as.integer() strips haven labels before filtering
step3 <- step2 |> dplyr::filter(as.integer(PAQ605) %in% c(1L, 2L))
n3 <- nrow(step3)
cat("Step 3 — Valid physical activity:", n3, "| excluded:", n2 - n3, "\n")
## Step 3 — Valid physical activity: 4569 | excluded: 4
# Step 4: Exclude missing BMI
step4 <- step3 |> dplyr::filter(!is.na(BMXBMI))
n4 <- nrow(step4)
cat("Step 4 — Complete BMI:", n4, "| excluded:", n3 - n4, "\n")
## Step 4 — Complete BMI: 4297 | excluded: 272
cat("\nFINAL ANALYTICAL SAMPLE N =", n4, "\n")
##
## FINAL ANALYTICAL SAMPLE N = 4297
cat("Retained:", round(100 * n4 / starting_n, 1), "% of starting sample\n")
## Retained: 46.4 % of starting sample
nhanes_analysis <- step4 |>
dplyr::mutate(
# ── OUTCOME ───────────────────────────────────────────────────────────────
# SLD012: usual sleep hours on workdays/weekdays (continuous, hours/night)
sleep_hrs = as.numeric(SLD012),
# ── PRIMARY EXPOSURE ──────────────────────────────────────────────────────
# PAQ605: vigorous or moderate physical activity in past week
# 1 = Yes, 2 = No; reference level = No (inactive)
phys_active = factor(
dplyr::case_when(
as.integer(PAQ605) == 1L ~ "Yes",
as.integer(PAQ605) == 2L ~ "No",
TRUE ~ NA_character_
),
levels = c("No", "Yes")
),
# ── EFFECT MODIFIER ───────────────────────────────────────────────────────
# Age group: 3 levels; reference = 20-39 years (youngest group)
age_group = factor(
dplyr::case_when(
RIDAGEYR >= 20 & RIDAGEYR <= 39 ~ "20-39 years",
RIDAGEYR >= 40 & RIDAGEYR <= 59 ~ "40-59 years",
RIDAGEYR >= 60 & RIDAGEYR <= 70 ~ "60-70 years"
),
levels = c("20-39 years", "40-59 years", "60-70 years")
),
# Continuous age (for correlations)
age = RIDAGEYR,
# ── COVARIATES ────────────────────────────────────────────────────────────
# Gender: 1 = Male, 2 = Female; reference = Male
gender = factor(
dplyr::case_when(
as.integer(RIAGENDR) == 1L ~ "Male",
as.integer(RIAGENDR) == 2L ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
),
# BMI: continuous (kg/m²)
bmi = as.numeric(BMXBMI),
# Income-to-poverty ratio: continuous (< 1.0 = below poverty line)
income_poverty = as.numeric(INDFMPIR),
# Mental health — BINARY (primary specification)
# DPQ010 (PHQ-9 item 1): 0-1 = Good; 2-3 = Poor
# Rationale: scores 2-3 reflect clinically meaningful frequency
# (more than half the days / nearly every day), consistent with PHQ-9
# item-level scoring conventions.
mental_health_bin = dplyr::case_when(
as.integer(DPQ010) %in% c(0L, 1L) ~ "Good",
as.integer(DPQ010) %in% c(2L, 3L) ~ "Poor",
TRUE ~ NA_character_
),
# Mental health — ORDINAL (for sensitivity analysis)
mental_health_ord = dplyr::if_else(
as.integer(DPQ010) %in% 0L:3L,
as.numeric(DPQ010),
NA_real_
),
# ── SURVEY DESIGN VARIABLES ───────────────────────────────────────────────
survey_psu = SDMVPSU,
survey_strata = SDMVSTRA,
survey_weight = WTINT2YR
)
final_n <- nrow(nhanes_analysis)
cat("Analytical dataset created: N =", final_n, "\n")
## Analytical dataset created: N = 4297
# Verify DPQ010 recoding
cat("\nDPQ010 raw distribution (before recoding):\n")
##
## DPQ010 raw distribution (before recoding):
print(table(nhanes_analysis$DPQ010, useNA = "always"))
##
## Not at all Several days More than half the days
## 2984 657 227
## Nearly every day Refused Don't know
## 145 1 4
## <NA>
## 279
cat("\nmental_health_bin distribution:\n")
##
## mental_health_bin distribution:
print(table(nhanes_analysis$mental_health_bin, useNA = "always"))
##
## Good Poor <NA>
## 2984 884 429
key_vars <- c("sleep_hrs", "phys_active", "age_group", "gender",
"bmi", "income_poverty", "mental_health_bin")
missing_summary <- nhanes_analysis |>
dplyr::select(all_of(key_vars)) |>
dplyr::summarise(dplyr::across(everything(), ~ sum(is.na(.)))) |>
tidyr::pivot_longer(everything(),
names_to = "Variable",
values_to = "N_Missing") |>
dplyr::mutate(
N_Total = final_n,
Pct_Missing = round(N_Missing / N_Total * 100, 1),
Flag = ifelse(Pct_Missing > 10, ">10% (high)", "<=10% (acceptable)")
) |>
dplyr::arrange(desc(Pct_Missing))
missing_summary |>
knitr::kable(
col.names = c("Variable", "N Missing", "N Total", "% Missing", "Flag"),
caption = "Table S1. Missing Data Summary for Key Analytic Variables"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Variable | N Missing | N Total | % Missing | Flag |
|---|---|---|---|---|
| income_poverty | 561 | 4297 | 13.1 | >10% (high) |
| mental_health_bin | 429 | 4297 | 10.0 | <=10% (acceptable) |
| sleep_hrs | 0 | 4297 | 0.0 | <=10% (acceptable) |
| phys_active | 0 | 4297 | 0.0 | <=10% (acceptable) |
| age_group | 0 | 4297 | 0.0 | <=10% (acceptable) |
| gender | 0 | 4297 | 0.0 | <=10% (acceptable) |
| bmi | 0 | 4297 | 0.0 | <=10% (acceptable) |
The survey design object is created here on the full analytical sample (N = 4,297) for the descriptive table, then re-created on the complete-case sample (N = 3,396) for regression models.
survey_full <- nhanes_analysis |>
dplyr::filter(
!is.na(survey_psu),
!is.na(survey_strata),
!is.na(survey_weight)
)
nhanes_design_full <- svydesign(
id = ~survey_psu,
strata = ~survey_strata,
weights = ~survey_weight,
nest = TRUE,
data = survey_full
)
cat("Full survey design object: N =", nrow(survey_full), "\n")
## Full survey design object: N = 4297
tbl_svysummary(
data = nhanes_design_full,
include = c(sleep_hrs, age_group, gender, bmi, income_poverty,
mental_health_bin),
by = phys_active,
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n_unweighted} ({p}%)"
),
digits = list(
all_continuous() ~ 1,
all_categorical() ~ c(0, 1)
),
label = list(
sleep_hrs ~ "Sleep duration (hours/night)",
age_group ~ "Age group",
gender ~ "Gender",
bmi ~ "BMI (kg/m²)",
income_poverty ~ "Income-to-poverty ratio",
mental_health_bin ~ "Mental health status"
),
missing_text = "Missing"
) |>
add_overall() |>
add_p() |>
modify_header(
label ~ "**Characteristic**",
all_stat_cols() ~ "**{level}**\nN = {n_unweighted} ({p}%)"
) |>
modify_spanning_header(
c("stat_1", "stat_2") ~ "**Physical Activity Status**"
) |>
modify_caption(
"**Table 1. Participant Characteristics by Physical Activity Status,
NHANES 2017-2018 (Survey-Weighted)**"
) |>
bold_labels() |>
italicize_levels()
| Characteristic | Overall N = 4297 (1%) |
Physical Activity Status
|
p-value1 | |
|---|---|---|---|---|
| No N = 3162 (0.714610156109615%) | Yes N = 1135 (0.285389843890385%) | |||
| Sleep duration (hours/night), Mean (SD) | 7.5 (1.5) | 7.6 (1.5) | 7.2 (1.5) | <0.001 |
| Age group, n (unweighted) (%) | 0.002 | |||
| 20-39 years | 1,565 (40.8%) | 1,058 (37.9%) | 507 (47.9%) | |
| 40-59 years | 1,625 (40.1%) | 1,224 (41.7%) | 401 (36.1%) | |
| 60-70 years | 1,107 (19.1%) | 880 (20.4%) | 227 (16.0%) | |
| Gender, n (unweighted) (%) | <0.001 | |||
| Male | 2,049 (48.3%) | 1,323 (41.6%) | 726 (65.0%) | |
| Female | 2,248 (51.7%) | 1,839 (58.4%) | 409 (35.0%) | |
| BMI (kg/m²), Mean (SD) | 29.9 (7.5) | 29.8 (7.6) | 30.2 (7.2) | 0.19 |
| Income-to-poverty ratio, Mean (SD) | 3.1 (1.7) | 3.2 (1.7) | 2.8 (1.6) | 0.004 |
| Missing | 20,466,829 | 15,158,258 | 5,308,571 | |
| Mental health status, n (unweighted) (%) | 0.25 | |||
| Good | 2,984 (78.1%) | 2,202 (78.6%) | 782 (76.8%) | |
| Poor | 884 (21.9%) | 634 (21.4%) | 250 (23.2%) | |
| Missing | 15,916,905 | 12,159,792 | 3,757,114 | |
| 1 Design-based KruskalWallis test; Pearson’s X^2: Rao & Scott adjustment | ||||
# Restrict to complete cases on all model variables
# mental_health_bin is factored HERE, after filtering
nhanes_complete <- nhanes_analysis |>
dplyr::filter(
!is.na(sleep_hrs),
!is.na(phys_active),
!is.na(age_group),
!is.na(gender),
!is.na(bmi),
!is.na(income_poverty),
!is.na(mental_health_bin),
!is.na(survey_psu),
!is.na(survey_strata),
!is.na(survey_weight)
) |>
dplyr::mutate(
# Factor HERE — after filtering — so both levels are guaranteed present
mental_health_bin = factor(mental_health_bin, levels = c("Good", "Poor"))
)
cat("Complete cases for modeling: N =", nrow(nhanes_complete), "\n")
## Complete cases for modeling: N = 3396
cat("Excluded for missing covariates:", final_n - nrow(nhanes_complete), "\n")
## Excluded for missing covariates: 901
# Verify all factors have >= 2 levels (safeguard against contrasts error)
check_factor_levels <- function(df, vars) {
for (v in vars) {
n_lev <- nlevels(df[[v]])
cat(sprintf(" %-22s levels = %d | table: %s\n",
v, n_lev,
paste(names(table(df[[v]])), table(df[[v]]),
sep = "=", collapse = " ")))
if (n_lev < 2) {
stop(paste0("FACTOR '", v, "' HAS ONLY ", n_lev,
" LEVEL(S). Check recode and filter steps."))
}
}
cat(" All factors OK.\n")
}
cat("\n=== Pre-flight factor check ===\n")
##
## === Pre-flight factor check ===
check_factor_levels(
nhanes_complete,
c("phys_active", "age_group", "gender", "mental_health_bin")
)
## phys_active levels = 2 | table: No=2486 Yes=910
## age_group levels = 3 | table: 20-39 years=1239 40-59 years=1290 60-70 years=867
## gender levels = 2 | table: Male=1638 Female=1758
## mental_health_bin levels = 2 | table: Good=2622 Poor=774
## All factors OK.
# Survey design object for complete-case regression models
svy_design <- svydesign(
id = ~survey_psu,
strata = ~survey_strata,
weights = ~survey_weight,
nest = TRUE,
data = nhanes_complete
)
cat("\nSurvey design for models: N =", nrow(nhanes_complete), "\n")
##
## Survey design for models: N = 3396
mod1_unadj <- svyglm(
sleep_hrs ~ phys_active,
design = svy_design,
family = gaussian()
)
mod1_unadj |>
tbl_regression(
label = list(phys_active ~ "Physical activity (Yes vs. No)"),
conf.int = TRUE
) |>
bold_labels() |>
bold_p(t = 0.05) |>
modify_caption("**Table 2a. Model 1: Unadjusted Survey-Weighted Linear Regression**")
| Characteristic | Beta (95% CI) | p-value |
|---|---|---|
| Physical activity (Yes vs. No) | ||
| No | — | |
| Yes | -0.40 (-0.53 to -0.27) | <0.001 |
| Abbreviation: CI = Confidence Interval | ||
mod2_full <- svyglm(
sleep_hrs ~ phys_active + age_group + gender + bmi +
income_poverty + mental_health_bin,
design = svy_design,
family = gaussian()
)
mod2_full |>
tbl_regression(
label = list(
phys_active ~ "Physical activity (Yes vs. No)",
age_group ~ "Age group",
gender ~ "Gender",
bmi ~ "BMI (per 1 kg/m²)",
income_poverty ~ "Income-to-poverty ratio (per unit)",
mental_health_bin ~ "Mental health status"
),
conf.int = TRUE
) |>
bold_labels() |>
bold_p(t = 0.05) |>
italicize_levels() |>
modify_caption("**Table 2b. Model 2: Full Multivariable Survey-Weighted Linear Regression**")
| Characteristic | Beta (95% CI) | p-value |
|---|---|---|
| Physical activity (Yes vs. No) | ||
| No | — | |
| Yes | -0.37 (-0.55 to -0.18) | 0.002 |
| Age group | ||
| 20-39 years | — | |
| 40-59 years | -0.29 (-0.45 to -0.13) | 0.003 |
| 60-70 years | -0.06 (-0.31 to 0.19) | 0.58 |
| Gender | ||
| Male | — | |
| Female | 0.28 (0.12 to 0.43) | 0.003 |
| BMI (per 1 kg/m²) | -0.01 (-0.02 to 0.01) | 0.26 |
| Income-to-poverty ratio (per unit) | -0.05 (-0.12 to 0.01) | 0.10 |
| Mental health status | ||
| Good | — | |
| Poor | -0.04 (-0.26 to 0.18) | 0.68 |
| Abbreviation: CI = Confidence Interval | ||
beta_unadj <- coef(mod1_unadj)["phys_activeYes"]
beta_full <- coef(mod2_full)["phys_activeYes"]
pct_change <- round(100 * (beta_full - beta_unadj) / abs(beta_unadj), 1)
data.frame(
Model = c("Model 1: Unadjusted", "Model 2: Full Multivariable"),
Beta = round(c(beta_unadj, beta_full), 4),
CI_Lower = round(c(confint(mod1_unadj)["phys_activeYes", 1],
confint(mod2_full)["phys_activeYes", 1]), 4),
CI_Upper = round(c(confint(mod1_unadj)["phys_activeYes", 2],
confint(mod2_full)["phys_activeYes", 2]), 4),
Pct_Change = c(NA, pct_change)
) |>
knitr::kable(
col.names = c("Model", "beta (PA)", "95% CI Lower", "95% CI Upper",
"% Change from Crude"),
caption = "Table 2c. Confounding Assessment: PA Coefficient, Unadjusted vs. Adjusted"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Model | beta (PA) | 95% CI Lower | 95% CI Upper | % Change from Crude | |
|---|---|---|---|---|---|
| Model 1: Unadjusted | -0.4004 | -0.5336 | -0.2673 | NA | |
| phys_activeYes | Model 2: Full Multivariable | -0.3659 | -0.5483 | -0.1835 | 8.6 |
cat("% change in PA beta after full adjustment:", pct_change, "%\n")
## % change in PA beta after full adjustment: 8.6 %
cat(ifelse(abs(pct_change) >= 10,
"Interpretation: Meaningful confounding present (>=10% change).",
"Interpretation: Minimal confounding (<10% change)."), "\n")
## Interpretation: Minimal confounding (<10% change).
mod3_interact <- svyglm(
sleep_hrs ~ phys_active * age_group + gender + bmi +
income_poverty + mental_health_bin,
design = svy_design,
family = gaussian()
)
mod3_interact |>
tbl_regression(
label = list(
phys_active ~ "Physical activity (Yes vs. No)",
age_group ~ "Age group",
gender ~ "Gender",
bmi ~ "BMI (per 1 kg/m²)",
income_poverty ~ "Income-to-poverty ratio (per unit)",
mental_health_bin ~ "Mental health status"
),
conf.int = TRUE
) |>
bold_labels() |>
bold_p(t = 0.05) |>
italicize_levels() |>
modify_caption("**Table 2d. Model 3: Interaction Model (Physical Activity x Age Group)**")
| Characteristic | Beta (95% CI) | p-value |
|---|---|---|
| Physical activity (Yes vs. No) | ||
| No | — | |
| Yes | -0.39 (-0.63 to -0.15) | 0.007 |
| Age group | ||
| 20-39 years | — | |
| 40-59 years | -0.31 (-0.52 to -0.10) | 0.012 |
| 60-70 years | -0.07 (-0.36 to 0.23) | 0.60 |
| Gender | ||
| Male | — | |
| Female | 0.28 (0.12 to 0.44) | 0.006 |
| BMI (per 1 kg/m²) | -0.01 (-0.02 to 0.01) | 0.27 |
| Income-to-poverty ratio (per unit) | -0.05 (-0.12 to 0.02) | 0.11 |
| Mental health status | ||
| Good | — | |
| Poor | -0.04 (-0.27 to 0.19) | 0.68 |
| Physical activity (Yes vs. No) * Age group | ||
| Yes * 40-59 years | 0.06 (-0.29 to 0.42) | 0.67 |
| Yes * 60-70 years | 0.00 (-0.55 to 0.54) | 0.99 |
| Abbreviation: CI = Confidence Interval | ||
# Formal Wald test for joint significance of interaction terms
# regTermTest() is the correct method for survey-weighted models
wald_result <- regTermTest(mod3_interact, ~phys_active:age_group)
cat("\n=== WALD TEST: Physical Activity x Age Group Interaction ===\n")
##
## === WALD TEST: Physical Activity x Age Group Interaction ===
print(wald_result)
## Wald test for phys_active:age_group
## in svyglm(formula = sleep_hrs ~ phys_active * age_group + gender +
## bmi + income_poverty + mental_health_bin, design = svy_design,
## family = gaussian())
## F = 0.09829152 on 2 and 6 df: p= 0.90781
wald_p <- wald_result$p
cat("\np-value for interaction:", round(wald_p, 4), "\n")
##
## p-value for interaction: 0.9078
cat(ifelse(wald_p < 0.05,
"Conclusion: Statistically significant interaction detected.",
"Conclusion: No statistically significant interaction (p >= 0.05)."),
"\n")
## Conclusion: No statistically significant interaction (p >= 0.05).
Per Check-in 1 feedback: tests whether binary vs. ordinal mental health coding changes the primary physical activity estimate.
nhanes_sens_mh <- nhanes_complete |> dplyr::filter(!is.na(mental_health_ord))
svy_sens_mh <- svydesign(
id = ~survey_psu,
strata = ~survey_strata,
weights = ~survey_weight,
nest = TRUE,
data = nhanes_sens_mh
)
mod_bin_mh <- svyglm(
sleep_hrs ~ phys_active + age_group + gender + bmi +
income_poverty + mental_health_bin,
design = svy_sens_mh, family = gaussian()
)
mod_ord_mh <- svyglm(
sleep_hrs ~ phys_active + age_group + gender + bmi +
income_poverty + mental_health_ord,
design = svy_sens_mh, family = gaussian()
)
pct_diff_mh <- round(
100 * abs(coef(mod_ord_mh)["phys_activeYes"] -
coef(mod_bin_mh)["phys_activeYes"]) /
abs(coef(mod_bin_mh)["phys_activeYes"]), 1
)
data.frame(
Specification = c("Binary mental health (Good/Poor)",
"Ordinal mental health (0-3)"),
Beta_PhysActive = round(c(coef(mod_bin_mh)["phys_activeYes"],
coef(mod_ord_mh)["phys_activeYes"]), 4),
CI_Lower = round(c(confint(mod_bin_mh)["phys_activeYes", 1],
confint(mod_ord_mh)["phys_activeYes", 1]), 4),
CI_Upper = round(c(confint(mod_bin_mh)["phys_activeYes", 2],
confint(mod_ord_mh)["phys_activeYes", 2]), 4),
Pct_Change = c(NA, pct_diff_mh)
) |>
knitr::kable(
col.names = c("Mental Health Specification", "beta (PA)",
"95% CI Lower", "95% CI Upper", "% Change"),
caption = "Table S2. Sensitivity Analysis 1 — Mental Health Coding"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Mental Health Specification | beta (PA) | 95% CI Lower | 95% CI Upper | % Change | |
|---|---|---|---|---|---|
| Binary mental health (Good/Poor) | -0.3659 | -0.5483 | -0.1835 | NA | |
| phys_activeYes | Ordinal mental health (0-3) | -0.3656 | -0.5481 | -0.1831 | 0.1 |
cat("% difference in PA beta between codings:", pct_diff_mh, "%\n")
## % difference in PA beta between codings: 0.1 %
cat(ifelse(pct_diff_mh < 5,
"Conclusion: <5% change — binary coding is appropriate.",
"Conclusion: >=5% change — ordinal coding preferred."), "\n")
## Conclusion: <5% change — binary coding is appropriate.
Per Check-in 2 feedback: tests robustness after excluding participants reporting sleep < 4 or > 12 hours per night (n = 98 in complete-case sample).
# Identify extreme sleep values in complete-case sample
n_outliers <- sum(nhanes_complete$sleep_hrs < 4 | nhanes_complete$sleep_hrs > 12)
cat("N extreme sleep values (<4 or >12 hrs) in complete-case sample:",
n_outliers,
sprintf("(%.1f%%)\n", 100 * n_outliers / nrow(nhanes_complete)))
## N extreme sleep values (<4 or >12 hrs) in complete-case sample: 67 (2.0%)
# Exclude extreme values
nhanes_no_outliers <- nhanes_complete |>
dplyr::filter(sleep_hrs >= 4 & sleep_hrs <= 12)
cat("N after excluding extreme sleep values:", nrow(nhanes_no_outliers), "\n")
## N after excluding extreme sleep values: 3329
svy_no_outliers <- svydesign(
id = ~survey_psu,
strata = ~survey_strata,
weights = ~survey_weight,
nest = TRUE,
data = nhanes_no_outliers
)
mod_no_outliers <- svyglm(
sleep_hrs ~ phys_active + age_group + gender + bmi +
income_poverty + mental_health_bin,
design = svy_no_outliers, family = gaussian()
)
beta_primary <- coef(mod2_full)["phys_activeYes"]
beta_no_out <- coef(mod_no_outliers)["phys_activeYes"]
pct_change_out <- round(100 * abs(beta_no_out - beta_primary) /
abs(beta_primary), 1)
data.frame(
Sample = c("Primary (N = 3,396)",
paste0("Excluding extreme sleep (N = ",
nrow(nhanes_no_outliers), ")")),
Beta_PhysActive = round(c(beta_primary, beta_no_out), 4),
CI_Lower = round(c(confint(mod2_full)["phys_activeYes", 1],
confint(mod_no_outliers)["phys_activeYes", 1]), 4),
CI_Upper = round(c(confint(mod2_full)["phys_activeYes", 2],
confint(mod_no_outliers)["phys_activeYes", 2]), 4),
Pct_Change = c(NA, pct_change_out)
) |>
knitr::kable(
col.names = c("Sample", "beta (PA)", "95% CI Lower", "95% CI Upper",
"% Change from Primary"),
caption = "Table S3. Sensitivity Analysis 2 — Excluding Extreme Sleep Duration"
) |>
kableExtra::kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Sample | beta (PA) | 95% CI Lower | 95% CI Upper | % Change from Primary | |
|---|---|---|---|---|---|
| Primary (N = 3,396) | -0.3659 | -0.5483 | -0.1835 | NA | |
| phys_activeYes | Excluding extreme sleep (N = 3329) | -0.3483 | -0.5011 | -0.1955 | 4.8 |
cat("% change in PA beta after excluding outliers:", pct_change_out, "%\n")
## % change in PA beta after excluding outliers: 4.8 %
Diagnostic plots are produced using an equivalent unweighted lm() model on the same complete-case sample. The survey-weighted and unweighted models yield nearly identical fitted values and residuals for this dataset, so these diagnostics are informative for the weighted model.
mod_diag <- lm(
sleep_hrs ~ phys_active + age_group + gender + bmi +
income_poverty + mental_health_bin,
data = nhanes_complete
)
diag_data <- data.frame(
fitted = fitted(mod_diag),
residuals = residuals(mod_diag),
std_resid = rstandard(mod_diag)
)
ggplot(diag_data, aes(x = fitted, y = residuals)) +
geom_point(alpha = 0.3, size = 0.8, color = "steelblue") +
geom_hline(yintercept = 0, color = "red", linewidth = 1, linetype = "dashed") +
geom_smooth(method = "loess", se = FALSE, color = "darkred", linewidth = 1) +
labs(
title = "Figure D1. Residuals vs. Fitted Values",
subtitle = "Assesses linearity assumption",
x = "Fitted Values (Predicted Sleep Duration, hours)",
y = "Residuals (hours)"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 13))
Figure D1. Residuals vs. Fitted Values. Assesses linearity assumption. Red dashed line = zero; smooth curve = loess trend.
ggplot(diag_data, aes(sample = std_resid)) +
stat_qq(alpha = 0.4, color = "steelblue") +
stat_qq_line(color = "red", linewidth = 1) +
labs(
title = "Figure D2. Normal Q-Q Plot of Standardized Residuals",
subtitle = "Assesses normality of residuals",
x = "Theoretical Quantiles",
y = "Standardized Residuals"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 13))
Figure D2. Normal Q-Q plot of standardized residuals. Points should follow the red diagonal if residuals are normally distributed.
diag_data <- diag_data |>
dplyr::mutate(sqrt_abs_std_resid = sqrt(abs(std_resid)))
ggplot(diag_data, aes(x = fitted, y = sqrt_abs_std_resid)) +
geom_point(alpha = 0.3, size = 0.8, color = "steelblue") +
geom_smooth(method = "loess", se = FALSE, color = "darkred", linewidth = 1) +
labs(
title = "Figure D3. Scale-Location Plot",
subtitle = "Assesses homoscedasticity (constant variance)",
x = "Fitted Values (Predicted Sleep Duration, hours)",
y = expression(sqrt("|Standardized Residuals|"))
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 13))
Figure D3. Scale-Location plot. A flat smoother indicates constant variance (homoscedasticity).
diag_data2 <- data.frame(
leverage = hatvalues(mod_diag),
std_resid = rstandard(mod_diag),
cooks_d = cooks.distance(mod_diag)
)
cooks_threshold <- 4 / nrow(nhanes_complete)
n_influential <- sum(diag_data2$cooks_d > cooks_threshold)
ggplot(diag_data2, aes(x = leverage, y = std_resid, size = cooks_d)) +
geom_point(alpha = 0.4, color = "steelblue") +
geom_hline(yintercept = c(-3, 3), linetype = "dashed",
color = "darkred", linewidth = 0.8) +
geom_vline(xintercept = 2 * mean(diag_data2$leverage),
linetype = "dashed", color = "orange", linewidth = 0.8) +
scale_size_continuous(name = "Cook's Distance", range = c(0.5, 4)) +
labs(
title = "Figure D4. Residuals vs. Leverage",
subtitle = paste0("N with Cook's D > 4/n: ", n_influential),
x = "Leverage",
y = "Standardized Residuals"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold", size = 13))
Figure D4. Residuals vs. Leverage. Point size = Cook’s distance. Dashed red lines = ±3 SD; orange = high leverage threshold.
cat("Cook's D threshold (4/n):", round(cooks_threshold, 5), "\n")
## Cook's D threshold (4/n): 0.00118
cat("N exceeding threshold:", n_influential,
sprintf("(%.1f%%)\n", 100 * n_influential / nrow(nhanes_complete)))
## N exceeding threshold: 209 (6.2%)
cat("Max Cook's D:", round(max(diag_data2$cooks_d), 5), "\n")
## Max Cook's D: 0.00961
# ggpredict() holds continuous covariates at mean, categorical at reference
pred_pa <- ggpredict(mod2_full, terms = "phys_active")
pred_pa_df <- as.data.frame(pred_pa) |>
dplyr::mutate(x_label = ifelse(x == "No", "Inactive", "Active"))
ggplot(pred_pa_df, aes(x = x_label, y = predicted, color = x_label)) +
geom_point(size = 5) +
geom_errorbar(
aes(ymin = conf.low, ymax = conf.high),
width = 0.15, linewidth = 1.2
) +
scale_color_manual(values = c("Inactive" = "#CD5C5C", "Active" = "#2E8B57")) +
scale_y_continuous(
limits = c(min(pred_pa_df$conf.low) - 0.1,
max(pred_pa_df$conf.high) + 0.1)
) +
labs(
title = "Figure 1. Adjusted Predicted Sleep Duration by Physical Activity Status",
subtitle = "From Model 2 (full multivariable); covariates held at mean/reference values",
x = "Physical Activity Status",
y = "Predicted Mean Sleep Duration (hours per night)",
caption = paste0(
"Error bars = 95% confidence intervals. ",
"Covariates: age group = 20-39 years, gender = Male, ",
"BMI and income-poverty at sample means, mental health = Good. ",
"N = ", nrow(nhanes_complete), "."
)
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 11),
axis.title = element_text(size = 12),
axis.text = element_text(size = 12),
legend.position = "none"
)
Figure 1. Adjusted predicted mean sleep duration by physical activity status (Model 2). Error bars = 95% CIs. Covariates held at reference/mean values. N = 3,396.
coef_data <- broom::tidy(mod2_full, conf.int = TRUE) |>
dplyr::filter(term != "(Intercept)") |>
dplyr::mutate(
term = dplyr::case_when(
term == "phys_activeYes" ~ "Physical Activity: Yes vs. No",
term == "age_group40-59 years" ~ "Age Group: 40-59 vs. 20-39 yrs",
term == "age_group60-70 years" ~ "Age Group: 60-70 vs. 20-39 yrs",
term == "genderFemale" ~ "Gender: Female vs. Male",
term == "bmi" ~ "BMI (per 1 kg/m²)",
term == "income_poverty" ~ "Income-Poverty Ratio (per unit)",
term == "mental_health_binPoor" ~ "Mental Health: Poor vs. Good",
TRUE ~ term
),
term = factor(term, levels = rev(c(
"Physical Activity: Yes vs. No",
"Age Group: 40-59 vs. 20-39 yrs",
"Age Group: 60-70 vs. 20-39 yrs",
"Gender: Female vs. Male",
"BMI (per 1 kg/m²)",
"Income-Poverty Ratio (per unit)",
"Mental Health: Poor vs. Good"
))),
significant = ifelse(p.value < 0.05, "p < 0.05", "p >= 0.05")
)
ggplot(coef_data, aes(x = estimate, y = term, color = significant)) +
geom_vline(xintercept = 0, linetype = "dashed",
color = "grey50", linewidth = 0.8) +
geom_errorbarh(
aes(xmin = conf.low, xmax = conf.high),
height = 0.3, linewidth = 1
) +
geom_point(size = 4) +
scale_color_manual(
values = c("p < 0.05" = "#2E8B57", "p >= 0.05" = "#888888"),
name = "Significance"
) +
labs(
title = "Figure 2. Regression Coefficient Plot — Full Multivariable Model (Model 2)",
subtitle = "Survey-weighted linear regression; outcome = sleep duration (hours/night)",
x = "Estimated Coefficient beta (hours per night)",
y = NULL,
caption = paste0(
"Points = beta estimates. Horizontal bars = 95% CIs. ",
"Dashed line = null (beta = 0). Green = p < 0.05. ",
"N = ", nrow(nhanes_complete), "."
)
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 13),
plot.subtitle = element_text(size = 11),
axis.title.x = element_text(size = 12),
axis.text = element_text(size = 11),
legend.position = "bottom"
)
Figure 2. Coefficient plot from Model 2. Points = beta estimates; bars = 95% CIs. Green = p < 0.05; grey = p >= 0.05. Dashed line = null (beta = 0). N = 3,396.
sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] car_3.1-5 carData_3.0-6 GGally_2.4.0 moments_0.14.1
## [5] ggeffects_2.3.2 kableExtra_1.4.0 knitr_1.51 broom_1.0.12
## [9] gtsummary_2.5.0 survey_4.5 survival_3.7-0 Matrix_1.7-1
## [13] nhanesA_1.4.1 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [17] dplyr_1.2.1 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [21] tibble_3.2.1 ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.3 farver_2.1.2
## [4] S7_0.2.1 fastmap_1.2.0 broom.helpers_1.22.0
## [7] labelled_2.16.0 digest_0.6.37 timechange_0.3.0
## [10] lifecycle_1.0.5 magrittr_2.0.3 compiler_4.4.2
## [13] rlang_1.1.7 sass_0.4.10 tools_4.4.2
## [16] yaml_2.3.10 gt_1.3.0 labeling_0.4.3
## [19] curl_7.0.0 plyr_1.8.9 xml2_1.5.2
## [22] RColorBrewer_1.1-3 abind_1.4-8 withr_3.0.2
## [25] foreign_0.8-87 datawizard_1.3.0 scales_1.4.0
## [28] insight_1.4.6 cli_3.6.3 rmarkdown_2.30
## [31] generics_0.1.4 otel_0.2.0 rstudioapi_0.18.0
## [34] httr_1.4.7 tzdb_0.4.0 commonmark_2.0.0
## [37] DBI_1.2.3 cachem_1.1.0 splines_4.4.2
## [40] rvest_1.0.5 selectr_0.5-1 mitools_2.4
## [43] vctrs_0.7.3 jsonlite_2.0.0 litedown_0.9
## [46] hms_1.1.4 Formula_1.2-5 systemfonts_1.3.1
## [49] jquerylib_0.1.4 glue_1.8.0 ggstats_0.12.0
## [52] stringi_1.8.4 gtable_0.3.6 pillar_1.11.1
## [55] htmltools_0.5.8.1 R6_2.6.1 textshaping_0.4.0
## [58] evaluate_1.0.5 lattice_0.22-6 haven_2.5.5
## [61] markdown_2.0 backports_1.5.0 cards_0.7.1
## [64] bslib_0.10.0 Rcpp_1.1.1 cardx_0.3.2
## [67] nlme_3.1-166 svglite_2.2.2 mgcv_1.9-1
## [70] xfun_0.56 fs_1.6.6 pkgconfig_2.0.3
Per the EPI 553 course AI policy, all AI interactions are documented below. Code was first attempted independently; ChatGPT was used only for debugging assistance and manuscript drafting. I am prepared to explain every line of code submitted.
Interaction 1
Prompt: My svyglm() model is throwing a contrasts error on mental_health_bin. The factor only has one level after filtering. How do I fix this?
Response summary: ChatGPT explained that haven-labelled vectors retain label attributes causing case_when() to produce a single-level factor if filtering occurs before factoring. The fix is to keep mental_health_bin as a character string through all filter() steps, then call factor() only after filtering is complete. This is implemented in Section 4 (complete-case step).
Interaction 2
Prompt: regTermTest() is not recognized for my Wald test on the interaction terms in svyglm.
Response summary: ChatGPT confirmed regTermTest() is from the survey package and requires library(survey) to be explicitly loaded. Fix reflected in the setup chunk.
Interaction 3
Prompt: Help me draft the Discussion section. Primary finding: physically active adults sleep 0.37 hours less (counter to hypothesis). Interaction Wald p = 0.908.
Response summary: ChatGPT suggested three interpretive frameworks: (1) time displacement, (2) reverse causality, (3) limitations of the single-item SLD012 measure. I reviewed, edited, and verified all factual claims against my check-in outputs before inclusion.