Setup and Data Preparation
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
library(labelled)
library(scales)
library(survey)
library(srvyr)
library(gt)
library(patchwork)
library(scales)
library(labelled)
library(DiagrammeR)
library(DiagrammeRsvg)
library(rsvg)
library(tibble)
library(ggplot2)
library(flextable)
library(officer)
library(flextable)
options(survey.lonely.psu = "adjust")
options(scipen = 999)
set.seed(553)
Section 1: Data Loading and Analytical Sample
Seting working directory and path
project_folder <- "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/NSCH Project Files"
data_folder <- file.path(project_folder, "data")
dir.create(file.path(project_folder, "outputs", "tables"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(project_folder, "outputs", "figures"), showWarnings = FALSE, recursive = TRUE)
setwd(project_folder)
cat("Working directory:", getwd(), "\n")
## Working directory: C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/NSCH Project Files
Loading and stacking NSCH survey data (2020–2022)
data_folder <- "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/NSCH Project Files/data"
nsch_2020 <- read_sas(file.path(data_folder, "nsch_2020_topical_SAS/nsch_2020e_topical.sas7bdat")) %>%
filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17) %>%
mutate(YEAR = 2020)
nsch_2021 <- read_sas(file.path(data_folder, "nsch_2021_topical_SAS/nsch_2021e_topical.sas7bdat")) %>%
filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17) %>%
mutate(YEAR = 2021)
nsch_2022 <- read_sas(file.path(data_folder, "nsch_2022_topical_SAS/nsch_2022e_topical.sas7bdat")) %>%
filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17) %>%
mutate(YEAR = 2022)
# Stack years into one pooled dataset
nsch_combined <- bind_rows(nsch_2020, nsch_2021, nsch_2022)
cat("2020 N =", nrow(nsch_2020), "\n")
## 2020 N = 39738
cat("2021 N =", nrow(nsch_2021), "\n")
## 2021 N = 46488
cat("2022 N =", nrow(nsch_2022), "\n")
## 2022 N = 49659
cat("Combined N =", format(nrow(nsch_combined), big.mark = ","), "\n")
## Combined N = 135,885
rm(nsch_2020, nsch_2021, nsch_2022)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2828167 151.1 4640658 247.9 4640658 247.9
## Vcells 74794352 570.7 171931001 1311.8 144789963 1104.7
Starting sample size in the raw dataset (the combined raw N).
Analytical Sample (exclusion)
# Start: pooled raw dataset
n_start <- nrow(nsch_combined)
# Exclusion 1: Age restriction (2–17 years)
nsch_step1 <- nsch_combined %>%
filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17)
n_step1 <- nrow(nsch_step1)
# Exclusion 2: Missing primary predictor (household language / English proficiency)
# Require HHLANGUAGE to be present.
# For non-English households (2 or 3), require SC_ENGLISH to classify language category.
nsch_step2 <- nsch_step1 %>%
filter(!is.na(HHLANGUAGE)) %>%
filter(!(HHLANGUAGE %in% c(2, 3) & is.na(SC_ENGLISH)))
n_step2 <- nrow(nsch_step2)
# Exclusion 3: Missing or zero survey weight
nsch_step3 <- nsch_step2 %>%
filter(!is.na(FWC), FWC > 0)
n_step3 <- nrow(nsch_step3)
# Final analytic dataset
nsch_analysis <- nsch_step3
n_final <- nrow(nsch_analysis)
# Exclusion table
exclusion_table <- tibble(
Step = c(
"1. Raw combined sample (2020–2022)",
"2. Exclude: age <2 or >17 years",
"3. Exclude: missing household language or English proficiency",
"4. Exclude: missing or zero survey weight (FWC)",
"5. Final analytic sample"
),
`N Remaining` = c(n_start, n_step1, n_step2, n_step3, n_final),
`N Dropped` = c(NA,
n_start - n_step1,
n_step1 - n_step2,
n_step2 - n_step3,
NA)
)
exclusion_table %>%
kable(
caption = "Table S1. Analytical Sample Construction — Exclusion Flowchart",
col.names = c("Step", "N Remaining", "N Dropped"),
format.args = list(big.mark = ","),
na = "—"
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) %>%
row_spec(5, bold = TRUE, background = "#d4edda")
Table S1. Analytical Sample Construction — Exclusion Flowchart
|
Step
|
N Remaining
|
N Dropped
|
- Raw combined sample (2020–2022)
|
135,885
|
NA
|
- Exclude: age <2 or >17 years
|
135,885
|
0
|
- Exclude: missing household language or English proficiency
|
133,052
|
2,833
|
- Exclude: missing or zero survey weight (FWC)
|
133,052
|
0
|
- Final analytic sample
|
133,052
|
NA
|
Save Table_Flowchat
# ── SAVE OUTPUT ───────────────────────────────────────────────────
# 1) Save as HTML
exclusion_table %>%
kable(
caption = "Table S1. Analytical Sample Construction — Exclusion Flowchart",
col.names = c("Step", "N Remaining", "N Dropped"),
format.args = list(big.mark = ","),
na = "—",
format = "html"
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) %>%
row_spec(5, bold = TRUE, background = "#d4edda") %>%
save_kable(
file = file.path(project_folder, "outputs/tables/TableS1_Exclusion_Flowchart.html")
)
# 2) Save as Word document
tableS1_ft <- exclusion_table %>%
mutate(
`N Remaining` = format(`N Remaining`, big.mark = ","),
`N Dropped` = ifelse(is.na(`N Dropped`), "—",
format(`N Dropped`, big.mark = ","))
) %>%
flextable::flextable() %>%
flextable::set_header_labels(
Step = "Step",
`N Remaining` = "N Remaining",
`N Dropped` = "N Dropped"
) %>%
flextable::bold(i = 5, part = "body") %>% # bold final row
flextable::bg(i = 5, bg = "#d4edda", part = "body") %>% # green final row
flextable::bg(i = 1, bg = "#f8f9fa", part = "header") %>%
flextable::font(fontname = "Times New Roman", part = "all") %>%
flextable::fontsize(size = 11, part = "all") %>%
flextable::autofit() %>%
flextable::theme_booktabs() %>%
flextable::set_caption(
"Table S1. Analytical Sample Construction — Exclusion Flowchart"
)
doc_s1 <- officer::read_docx() %>%
officer::body_add_par(
"Table S1. Analytical Sample Construction — Exclusion Flowchart",
style = "heading 2"
) %>%
officer::body_add_par(" ", style = "Normal") %>%
flextable::body_add_flextable(tableS1_ft) %>%
officer::body_add_par(" ", style = "Normal") %>%
officer::body_add_par(
"NSCH 2020–2022. Raw combined sample includes all children aged 2–17 years across three survey waves. Exclusions applied sequentially. Final analytic sample retains only children with valid household language classification and non-zero survey weight.",
style = "Normal"
)
print(
doc_s1,
target = file.path(project_folder, "outputs/tables/TableS1_Exclusion_Flowchart.docx")
)
cat("Table S1 saved as HTML and Word successfully.\n")
## Table S1 saved as HTML and Word successfully.
- Starting N -> Each exclusion criterion -> N dropped at each
step -> Final N
Create Consort-style flow diagram
library(DiagrammeR)
flowchart <- grViz(sprintf("
digraph flowchart {
graph [
layout = dot,
rankdir = TB,
labelloc = t,
label = <<B>Figure 1. Analytical Sample Selection</B><BR/>
<FONT POINT-SIZE='12'>
Household Language and Children's Mental and Behavioral Health<BR/>
National Survey of Children's Health (NSCH), 2020-2022
</FONT>>,
fontsize = 18
]
node [
shape = box,
style = filled,
fillcolor = '#f8f9fa',
fontname = Helvetica,
fontsize = 12,
width = 4
]
A [label = 'Raw Combined Sample (2020-2022)\\nN = %s']
B [label = 'Exclude: Age < 2 years\\nRemaining N = %s']
C [label = 'Exclude: Missing Household Language or English Proficiency\\nRemaining N = %s']
D [label = 'Exclude: Missing or Zero Survey Weight (FWC)\\nRemaining N = %s']
E [label = 'Final Analytical Sample for Analysis\\nN = %s',
fillcolor = '#d4edda',
fontsize = 13]
A -> B
B -> C
C -> D
D -> E
}
",
format(n_start, big.mark = ","),
format(n_step1, big.mark = ","),
format(n_step2, big.mark = ","),
format(n_step3, big.mark = ","),
format(n_final, big.mark = ",")
))
flowchart
svg <- export_svg(flowchart)
rsvg_png(charToRaw(svg),
file = "outputs/figures/Figure1_Analytical_Sample_Flow.png",
width = 1200,
height = 1800)
Creating a Working Subset
# Restrict to analytic variables and add survey design identifiers
nsch_subset <- nsch_analysis %>%
select(
# Survey design
STRATUM, HHID, FWC, YEAR,
# Outcomes (mental & behavioral health conditions)
K2Q31A, K2Q31B, # ADHD
K2Q32A, K2Q32B, # Depression
K2Q33A, K2Q33B, # Anxiety
K2Q34A, K2Q34B, # Behavioral/Conduct problems
K2Q35A, K2Q35B, # Autism
K2Q36A, K2Q36B, # Developmental delay
# Primary predictor
HHLANGUAGE, SC_ENGLISH,
# Moderators
METRO_YN, A1_MENTHEALTH, ACE1, ACE7,
# Covariates
SC_AGE_YEARS, SC_SEX, SC_RACE_R,
FPL_I1, A1_GRADE, FAMILY_R,
CURRINS, BORNUSA
)
cat("Final analytic subset dimensions:",
nrow(nsch_subset), "observations,",
ncol(nsch_subset), "variables\n")
## Final analytic subset dimensions: 133052 observations, 30 variables
Section 2: Variable Selection, Recoding, and Data Cleaning
Outcome Variable
nsch_clean <- nsch_analysis %>%
mutate(
# Each condition requires BOTH diagnosis (A) AND current impairment (B)
# NA in either source variable is treated as condition not present (0)
has_adhd = case_when(
K2Q31A == 1 & K2Q31B == 1 ~ 1,
is.na(K2Q31A) | is.na(K2Q31B) ~ 0,
TRUE ~ 0
),
has_depression = case_when(
K2Q32A == 1 & K2Q32B == 1 ~ 1,
is.na(K2Q32A) | is.na(K2Q32B) ~ 0,
TRUE ~ 0
),
has_anxiety = case_when(
K2Q33A == 1 & K2Q33B == 1 ~ 1,
is.na(K2Q33A) | is.na(K2Q33B) ~ 0,
TRUE ~ 0
),
has_behavioral = case_when(
K2Q34A == 1 & K2Q34B == 1 ~ 1,
is.na(K2Q34A) | is.na(K2Q34B) ~ 0,
TRUE ~ 0
),
has_autism = case_when(
K2Q35A == 1 & K2Q35B == 1 ~ 1,
is.na(K2Q35A) | is.na(K2Q35B) ~ 0,
TRUE ~ 0
),
has_devdelay = case_when(
K2Q36A == 1 & K2Q36B == 1 ~ 1,
is.na(K2Q36A) | is.na(K2Q36B) ~ 0,
TRUE ~ 0
),
# Composite primary outcome
impairing_mental_behav = ifelse(
has_anxiety == 1 | has_depression == 1 | has_adhd == 1 |
has_behavioral == 1 | has_autism == 1 | has_devdelay == 1, 1, 0),
# Secondary outcomes by condition subtype
internalizing = ifelse(has_anxiety == 1 | has_depression == 1, 1, 0),
externalizing = ifelse(has_adhd == 1 | has_behavioral == 1, 1, 0),
neurodevelopmental = ifelse(has_autism == 1 | has_devdelay == 1, 1, 0)
)
# Verify zero NAs on all outcome variables
cat("NA check on outcome variables:\n")
## NA check on outcome variables:
nsch_clean %>%
summarise(across(
c(has_adhd, has_depression, has_anxiety, has_behavioral,
has_autism, has_devdelay, impairing_mental_behav,
internalizing, externalizing, neurodevelopmental),
~sum(is.na(.))
)) %>%
pivot_longer(everything(),
names_to = "variable",
values_to = "n_missing") %>%
print()
## # A tibble: 10 × 2
## variable n_missing
## <chr> <int>
## 1 has_adhd 0
## 2 has_depression 0
## 3 has_anxiety 0
## 4 has_behavioral 0
## 5 has_autism 0
## 6 has_devdelay 0
## 7 impairing_mental_behav 0
## 8 internalizing 0
## 9 externalizing 0
## 10 neurodevelopmental 0
# Distribution of primary outcome
nsch_clean %>%
count(impairing_mental_behav) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
kable(
caption = "Distribution of Primary Outcome: Any Impairing Mental/Behavioral Condition",
col.names = c("Value", "N", "%"),
format.args = list(big.mark = ",")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Distribution of Primary Outcome: Any Impairing Mental/Behavioral
Condition
|
Value
|
N
|
%
|
|
0
|
104,458
|
78.5
|
|
1
|
28,594
|
21.5
|
Prints outcome variable as binary (0/1)
Primary Predictor: Household Language
# Household language (3-category factor)
# Derived from HHLANGUAGE (household language) and SC_ENGLISH
# (child English proficiency, asked only if HHLANGUAGE != 1)
nsch_clean <- nsch_clean %>%
mutate(
household_language = factor(case_when(
HHLANGUAGE == 1 ~ "English only",
HHLANGUAGE %in% c(2,3) & SC_ENGLISH %in% c(1, 2) ~ "Bilingual",
HHLANGUAGE %in% c(2,3) & SC_ENGLISH %in% c(3, 4) ~ "Non-English"
), levels = c("English only", "Bilingual", "Non-English"))
)
# Distribution of predictor
nsch_clean %>%
count(household_language) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
kable(
caption = "Distribution of Primary Predictor: Household Language",
col.names = c("Group", "N", "%"),
format.args = list(big.mark = ",")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Distribution of Primary Predictor: Household Language
|
Group
|
N
|
%
|
|
English only
|
124,361
|
93.5
|
|
Bilingual
|
7,831
|
5.9
|
|
Non-English
|
860
|
0.6
|
Covariates
nsch_clean <- nsch_clean %>%
mutate(
# Child age (continuous, years)
child_age = SC_AGE_YEARS,
# Age group (categorical)
age_group = factor(
case_when(
SC_AGE_YEARS >= 2 & SC_AGE_YEARS <= 5 ~ "2-5 years",
SC_AGE_YEARS >= 6 & SC_AGE_YEARS <= 11 ~ "6-11 years",
SC_AGE_YEARS >= 12 & SC_AGE_YEARS <= 17 ~ "12-17 years",
TRUE ~ NA_character_
),
levels = c("2-5 years", "6-11 years", "12-17 years")
),
# Child sex (binary)
child_sex = factor(
case_when(
SC_SEX == 1 ~ "Male",
SC_SEX == 2 ~ "Female",
TRUE ~ NA_character_
),
levels = c("Male", "Female")
),
# Child race (SC_RACE_R: detailed race categories)
child_race = factor(
case_when(
SC_RACE_R == 1 ~ "White alone",
SC_RACE_R == 2 ~ "Black or African American alone",
SC_RACE_R == 3 ~ "American Indian or Alaska Native alone",
SC_RACE_R == 4 ~ "Asian alone",
SC_RACE_R == 5 ~ "Native Hawaiian / Other Pacific Islander alone",
SC_RACE_R == 7 ~ "Two or more races",
TRUE ~ NA_character_
),
levels = c(
"White alone",
"Black or African American alone",
"American Indian or Alaska Native alone",
"Asian alone",
"Native Hawaiian / Other Pacific Islander alone",
"Two or more races"
)
),
# Household income as % Federal Poverty Level (4-category)
income_fpl = factor(
case_when(
FPL_I1 < 100 ~ "0-99% FPL",
FPL_I1 >= 100 & FPL_I1 < 200 ~ "100-199% FPL",
FPL_I1 >= 200 & FPL_I1 < 400 ~ "200-399% FPL",
FPL_I1 >= 400 ~ "400%+ FPL",
TRUE ~ NA_character_
),
levels = c("0-99% FPL", "100-199% FPL", "200-399% FPL", "400%+ FPL")
),
# Parent education (4-category)
parent_education = factor(
case_when(
A1_GRADE %in% c(1, 2) ~ "Less than HS",
A1_GRADE %in% c(3, 4) ~ "High school",
A1_GRADE %in% c(5, 6) ~ "Some college",
A1_GRADE %in% c(7, 8, 9) ~ "Bachelor's+",
TRUE ~ NA_character_
),
levels = c("Less than HS", "High school", "Some college", "Bachelor's+")
),
# Family structure (3-category)
family_structure = factor(
case_when(
FAMILY_R %in% c(1, 2, 3) ~ "Two parents",
FAMILY_R %in% c(4, 5, 6) ~ "Single parent",
FAMILY_R %in% c(7, 8) ~ "Other",
TRUE ~ NA_character_
),
levels = c("Two parents", "Single parent", "Other")
),
# Health insurance (binary)
has_insurance = factor(
case_when(
CURRINS == 1 ~ "Insured",
CURRINS == 2 ~ "Uninsured",
TRUE ~ NA_character_
),
levels = c("Insured", "Uninsured")
),
# Child nativity (binary)
child_nativity = factor(
case_when(
BORNUSA == 1 ~ "US-born",
BORNUSA == 2 ~ "Foreign-born",
TRUE ~ NA_character_
),
levels = c("US-born", "Foreign-born")
),
# Urbanicity (binary)
urban_rural = factor(
case_when(
METRO_YN == 1 ~ "Urban",
METRO_YN == 2 ~ "Rural",
TRUE ~ NA_character_
),
levels = c("Urban", "Rural")
),
# Caregiver mental health (moderator)
caregiver_mental_health = factor(
case_when(
A1_MENTHEALTH %in% c(1, 2, 3) ~ "Good (excellent/very good/good)",
A1_MENTHEALTH %in% c(4, 5) ~ "Poor (fair/poor)",
TRUE ~ NA_character_
),
levels = c("Good (excellent/very good/good)", "Poor (fair/poor)")
),
# --- ACE individual items (binary 0/1) ---
# ACE1: Hard to cover basics (4-point scale → binary)
# Somewhat often (3) or Very often (4) = exposed
# AC1 Economic hardship was originally a 4-point scale but it was
# dichotomized to exposed (1) and unexposed (0)
ace1_bin = case_when(
ACE1 %in% c(3, 4) ~ 1L,
ACE1 %in% c(1, 2) ~ 0L,
TRUE ~ NA_integer_
),
# ACE3–ACE10: Binary Yes(1)/No(2) items
ace3_bin = case_when(ACE3 == 1 ~ 1L, ACE3 == 2 ~ 0L, TRUE ~ NA_integer_), # Parent divorced/separated
ace4_bin = case_when(ACE4 == 1 ~ 1L, ACE4 == 2 ~ 0L, TRUE ~ NA_integer_), # Parent died
ace5_bin = case_when(ACE5 == 1 ~ 1L, ACE5 == 2 ~ 0L, TRUE ~ NA_integer_), # Parent jailed
ace6_bin = case_when(ACE6 == 1 ~ 1L, ACE6 == 2 ~ 0L, TRUE ~ NA_integer_), # Domestic violence
ace7_bin = case_when(ACE7 == 1 ~ 1L, ACE7 == 2 ~ 0L, TRUE ~ NA_integer_), # Victim/witness of violence
ace8_bin = case_when(ACE8 == 1 ~ 1L, ACE8 == 2 ~ 0L, TRUE ~ NA_integer_), # Lived w/ mentally ill
ace9_bin = case_when(ACE9 == 1 ~ 1L, ACE9 == 2 ~ 0L, TRUE ~ NA_integer_), # Lived w/ substance user
ace10_bin = case_when(ACE10 == 1 ~ 1L, ACE10 == 2 ~ 0L, TRUE ~ NA_integer_), # Racial discrimination
# ACE burden: continuous count score (0-9)
# na.rm = TRUE allows partial scoring across years where not all items were asked
ace_burden = rowSums(
cbind(ace1_bin, ace3_bin, ace4_bin, ace5_bin, ace6_bin,
ace7_bin, ace8_bin, ace9_bin, ace10_bin),
na.rm = TRUE
),
# ACE burden: categorical (for descriptive table)
ace_burden_cat = factor(
case_when(
ace_burden == 0 ~ "0 ACEs",
ace_burden == 1 ~ "1 ACE",
ace_burden == 2 ~ "2 ACEs",
ace_burden >= 3 ~ "3+ ACEs",
TRUE ~ NA_character_
),
levels = c("0 ACEs", "1 ACE", "2 ACEs", "3+ ACEs")
),
# Survey year
survey_year = factor(YEAR),
# Survey design variables
survey_weight = FWC,
STRATUM_F = factor(STRATUM),
HHID_F = factor(HHID)
)
# --- Continuous summary (age + ACE burden) ---
covariate_summary <- nsch_clean %>%
summarise(
`Mean age (SD)` = paste0(
round(mean(child_age, na.rm = TRUE), 1),
" (", round(sd(child_age, na.rm = TRUE), 1), ")"
),
`Mean ACE burden (SD)` = paste0(
round(mean(ace_burden, na.rm = TRUE), 2),
" (", round(sd(ace_burden, na.rm = TRUE), 2), ")"
)
)
cat("Covariate summary computed.\n")
## Covariate summary computed.
# --- Categorical covariate table (N and %) ---
cov_vars <- c(
"age_group", "child_sex", "child_race",
"income_fpl", "caregiver_mental_health", "parent_education", "family_structure",
"has_insurance", "child_nativity", "urban_rural", "ace_burden_cat", "survey_year"
)
covariate_table <- map_dfr(cov_vars, function(v) {
nsch_clean %>%
count(.data[[v]]) %>%
mutate(
variable = v,
pct = round(100 * n / sum(n), 1),
level = as.character(.data[[v]])
) %>%
select(variable, level, n, pct)
})
covariate_table %>%
kable(
caption = "Distribution of All Covariates (Unweighted)",
col.names = c("Variable", "Level", "N", "%"),
format.args = list(big.mark = ",")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) %>%
collapse_rows(columns = 1, valign = "top")
Distribution of All Covariates (Unweighted)
|
Variable
|
Level
|
N
|
%
|
|
age_group
|
2-5 years
|
38,692
|
29.1
|
|
6-11 years
|
42,123
|
31.7
|
|
12-17 years
|
52,237
|
39.3
|
|
child_sex
|
Male
|
68,841
|
51.7
|
|
Female
|
64,211
|
48.3
|
|
child_race
|
White alone
|
102,263
|
76.9
|
|
Black or African American alone
|
9,601
|
7.2
|
|
American Indian or Alaska Native alone
|
1,294
|
1.0
|
|
Asian alone
|
7,601
|
5.7
|
|
Native Hawaiian / Other Pacific Islander alone
|
736
|
0.6
|
|
Two or more races
|
11,557
|
8.7
|
|
income_fpl
|
0-99% FPL
|
16,461
|
12.4
|
|
100-199% FPL
|
21,522
|
16.2
|
|
200-399% FPL
|
39,880
|
30.0
|
|
400%+ FPL
|
55,189
|
41.5
|
|
caregiver_mental_health
|
Good (excellent/very good/good)
|
120,056
|
90.2
|
|
Poor (fair/poor)
|
8,583
|
6.5
|
|
NA
|
4,413
|
3.3
|
|
parent_education
|
Less than HS
|
4,510
|
3.4
|
|
High school
|
21,574
|
16.2
|
|
Some college
|
31,783
|
23.9
|
|
Bachelor’s+
|
75,185
|
56.5
|
|
family_structure
|
Two parents
|
95,752
|
72.0
|
|
Single parent
|
28,776
|
21.6
|
|
Other
|
4,914
|
3.7
|
|
NA
|
3,610
|
2.7
|
|
has_insurance
|
Insured
|
126,663
|
95.2
|
|
Uninsured
|
5,795
|
4.4
|
|
NA
|
594
|
0.4
|
|
child_nativity
|
US-born
|
127,409
|
95.8
|
|
Foreign-born
|
4,370
|
3.3
|
|
NA
|
1,273
|
1.0
|
|
urban_rural
|
Urban
|
92,336
|
69.4
|
|
Rural
|
20,344
|
15.3
|
|
NA
|
20,372
|
15.3
|
|
ace_burden_cat
|
0 ACEs
|
84,399
|
63.4
|
|
1 ACE
|
26,405
|
19.8
|
|
2 ACEs
|
10,475
|
7.9
|
|
3+ ACEs
|
11,773
|
8.8
|
|
survey_year
|
2020
|
39,090
|
29.4
|
|
2021
|
45,447
|
34.2
|
|
2022
|
48,515
|
36.5
|
Missing Data Assessment
updated missing
# Assess missingness across all analytic variables
analytic_vars <- c(
"impairing_mental_behav", "internalizing", "externalizing",
"neurodevelopmental", "household_language",
"child_age", "child_sex", "age_group", "child_race",
"income_fpl", "caregiver_mental_health", "parent_education",
"family_structure", "has_insurance", "child_nativity",
"urban_rural", "survey_year",
"ace_burden", "ace_burden_cat"
)
missing_summary <- nsch_clean %>%
select(all_of(analytic_vars)) %>%
summarise(across(
everything(),
list(
n_miss = ~sum(is.na(.)),
pct_miss = ~round(100 * mean(is.na(.)), 2)
)
)) %>%
pivot_longer(
everything(),
names_to = c("variable", "stat"),
names_sep = "_(?=n_miss|pct_miss)"
) %>%
pivot_wider(names_from = stat, values_from = value) %>%
arrange(desc(pct_miss))
missing_summary %>%
kable(
caption = "Missing Data Summary — All Analytic Variables",
col.names = c("Variable", "N Missing", "% Missing"),
format.args = list(big.mark = ",")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) %>%
row_spec(
which(missing_summary$pct_miss >= 10),
background = "#fff3cd"
) %>%
row_spec(
which(missing_summary$pct_miss == 0),
background = "#d4edda"
)
Missing Data Summary — All Analytic Variables
|
Variable
|
N Missing
|
% Missing
|
|
urban_rural
|
20,372
|
15.31
|
|
caregiver_mental_health
|
4,413
|
3.32
|
|
family_structure
|
3,610
|
2.71
|
|
child_nativity
|
1,273
|
0.96
|
|
has_insurance
|
594
|
0.45
|
|
impairing_mental_behav
|
0
|
0.00
|
|
internalizing
|
0
|
0.00
|
|
externalizing
|
0
|
0.00
|
|
neurodevelopmental
|
0
|
0.00
|
|
household_language
|
0
|
0.00
|
|
child_age
|
0
|
0.00
|
|
child_sex
|
0
|
0.00
|
|
age_group
|
0
|
0.00
|
|
child_race
|
0
|
0.00
|
|
income_fpl
|
0
|
0.00
|
|
parent_education
|
0
|
0.00
|
|
survey_year
|
0
|
0.00
|
|
ace_burden
|
0
|
0.00
|
|
ace_burden_cat
|
0
|
0.00
|
cat("Variables with >10% missing:\n")
## Variables with >10% missing:
missing_summary %>% filter(pct_miss >= 10) %>% print()
## # A tibble: 1 × 3
## variable n_miss pct_miss
## <chr> <dbl> <dbl>
## 1 urban_rural 20372 15.3
cat("\nVariables with 0% missing:\n")
##
## Variables with 0% missing:
missing_summary %>% filter(pct_miss == 0) %>% print()
## # A tibble: 14 × 3
## variable n_miss pct_miss
## <chr> <dbl> <dbl>
## 1 impairing_mental_behav 0 0
## 2 internalizing 0 0
## 3 externalizing 0 0
## 4 neurodevelopmental 0 0
## 5 household_language 0 0
## 6 child_age 0 0
## 7 child_sex 0 0
## 8 age_group 0 0
## 9 child_race 0 0
## 10 income_fpl 0 0
## 11 parent_education 0 0
## 12 survey_year 0 0
## 13 ace_burden 0 0
## 14 ace_burden_cat 0 0
Section 3: Descriptive Statistics (Table 1)
Table 1
# 1) Pooled weights (3 years)
nsch_clean <- nsch_clean %>%
dplyr::mutate(weight_pooled = survey_weight / 3)
stopifnot("weight_pooled" %in% names(nsch_clean))
# 2) Survey design object
nsch_design <- survey::svydesign(
ids = ~HHID_F,
strata = ~STRATUM_F,
weights = ~weight_pooled,
data = nsch_clean,
nest = TRUE
)
# 3) Custom p-value function (continuous vars; 3+ groups)
svy_wald_p <- function(data, variable, by, ...) {
fml <- stats::as.formula(paste(variable, "~", by))
fit <- survey::svyglm(fml, design = data)
p <- survey::regTermTest(fit, by)$p
tibble::tibble(p.value = p)
}
# 4) Build Table 1 (weighted)
table1 <- gtsummary::tbl_svysummary(
nsch_design,
by = household_language,
include = c(
impairing_mental_behav, internalizing, externalizing, neurodevelopmental,
child_age, child_sex, age_group, child_race,
income_fpl, caregiver_mental_health, parent_education, family_structure,
has_insurance, child_nativity, urban_rural, ace_burden, ace_burden_cat, survey_year
),
statistic = list(
gtsummary::all_continuous() ~ "{mean} ({sd})",
gtsummary::all_categorical() ~ "{n} ({p}%)"
),
digits = list(
gtsummary::all_continuous() ~ 1,
gtsummary::all_categorical() ~ 1
),
label = list(
impairing_mental_behav ~ "Any impairing mental/behavioral condition",
internalizing ~ " Internalizing (anxiety or depression)",
externalizing ~ " Externalizing (ADHD or behavioral problems)",
neurodevelopmental ~ " Neurodevelopmental (autism or developmental delay)",
child_age ~ "Child age (years)",
child_sex ~ "Child sex",
age_group ~ "Age group",
child_race ~ "Child race",
income_fpl ~ "Household income (% Federal Poverty Level)",
caregiver_mental_health ~ "Caregiver mental health",
parent_education ~ "Primary caregiver education",
family_structure ~ "Family structure",
has_insurance ~ "Health insurance status",
child_nativity ~ "Child nativity",
urban_rural ~ "Urbanicity",
ace_burden ~ "ACE burden score (continuous, 0-9)",
ace_burden_cat ~ "ACE burden category",
survey_year ~ "Survey year"
),
missing = "no"
) %>%
gtsummary::add_overall(last = FALSE) %>%
gtsummary::add_p(
test = list(
gtsummary::all_continuous() ~ "svy_wald_p",
gtsummary::all_categorical() ~ "svy.chisq.test"
),
pvalue_fun = gtsummary::label_style_pvalue(digits = 3)
) %>%
gtsummary::bold_labels() # correctly piped — not floating
# 5) Subtitle numbers (unweighted + weighted)
unw_total <- nrow(nsch_clean)
unw_eng <- sum(nsch_clean$household_language == "English only", na.rm = TRUE)
unw_bil <- sum(nsch_clean$household_language == "Bilingual", na.rm = TRUE)
unw_noeng <- sum(nsch_clean$household_language == "Non-English", na.rm = TRUE)
wt_total <- round(sum(nsch_clean$weight_pooled, na.rm = TRUE) / 1e6, 1)
wt_eng <- round(sum(nsch_clean$weight_pooled[nsch_clean$household_language == "English only"], na.rm = TRUE) / 1e6, 1)
wt_bil <- round(sum(nsch_clean$weight_pooled[nsch_clean$household_language == "Bilingual"], na.rm = TRUE) / 1e6, 1)
wt_noeng <- round(sum(nsch_clean$weight_pooled[nsch_clean$household_language == "Non-English"], na.rm = TRUE) / 1e6, 1)
# 6) Render + save HTML
table1_gt <- table1 %>%
gtsummary::as_gt() %>%
gt::tab_header(
title = "Table 1. Descriptive Characteristics of the Analytic Sample by Household Language",
subtitle = paste0(
"NSCH 2020-2022 | Unweighted N = ", format(unw_total, big.mark = ","),
" | Weighted population (avg annual) = ", wt_total, " million",
" | English only: n = ", format(unw_eng, big.mark = ","), " (", wt_eng, "M)",
"; Bilingual: n = ", format(unw_bil, big.mark = ","), " (", wt_bil, "M)",
"; Non-English: n = ", format(unw_noeng, big.mark = ","), " (", wt_noeng, "M)"
)
) %>%
gt::tab_source_note(
source_note = gt::md(
"Values are weighted mean (SD) for continuous variables and weighted n (%) for categorical variables.
P-values are design-adjusted: Rao-Scott chi-square tests for categorical variables and Wald tests from survey-weighted regression for continuous variables.
ACE burden is the count of 9 adverse childhood experience items (range 0-9); ACE1 dichotomized at 'somewhat/very often'.
Pooled weights were computed as annual weights / 3 for combined 2020-2022 analyses."
)
) %>%
gt::opt_row_striping() %>%
gt::tab_options(
table.font.size = 12,
heading.title.font.size = 14,
heading.subtitle.font.size = 11,
data_row.padding = gt::px(4)
)
gt::gtsave(table1_gt,
file = file.path(project_folder, "outputs/tables/table1_by_language.html"))
table1_gt
| Table 1. Descriptive Characteristics of the Analytic Sample by Household Language |
| NSCH 2020-2022 | Unweighted N = 133,052 | Weighted population (avg annual) = 63.5 million | English only: n = 124,361 (55.1M); Bilingual: n = 7,831 (7.4M); Non-English: n = 860 (1M) |
| Characteristic |
Overall
N = 63,506,861 |
English only
N = 55,117,665 |
Bilingual
N = 7,413,253 |
Non-English
N = 975,944 |
p-value |
| Any impairing mental/behavioral condition |
12,461,074.2 (19.6%) |
11,652,544.6 (21.1%) |
668,150.7 (9.0%) |
140,378.9 (14.4%) |
<0.001 |
| Internalizing (anxiety or depression) |
6,393,064.8 (10.1%) |
5,985,631.1 (10.9%) |
374,493.5 (5.1%) |
32,940.3 (3.4%) |
<0.001 |
| Externalizing (ADHD or behavioral problems) |
7,641,799.3 (12.0%) |
7,223,207.9 (13.1%) |
338,236.1 (4.6%) |
80,355.3 (8.2%) |
<0.001 |
| Neurodevelopmental (autism or developmental delay) |
4,266,919.3 (6.7%) |
3,950,479.7 (7.2%) |
200,100.5 (2.7%) |
116,339.1 (11.9%) |
<0.001 |
| Child age (years) |
9.8 (4.5) |
9.6 (4.6) |
11.1 (3.8) |
7.4 (3.8) |
<0.001 |
| Child sex |
|
|
|
|
0.229 |
| Male |
32,385,772.1 (51.0%) |
28,140,779.7 (51.1%) |
3,698,042.9 (49.9%) |
546,949.4 (56.0%) |
|
| Female |
31,121,088.7 (49.0%) |
26,976,884.8 (48.9%) |
3,715,209.7 (50.1%) |
428,994.2 (44.0%) |
|
| Age group |
|
|
|
|
<0.001 |
| 2-5 years |
14,337,576.6 (22.6%) |
13,244,494.3 (24.0%) |
646,547.9 (8.7%) |
446,534.4 (45.8%) |
|
| 6-11 years |
24,017,499.4 (37.8%) |
20,481,261.6 (37.2%) |
3,180,905.0 (42.9%) |
355,332.8 (36.4%) |
|
| 12-17 years |
25,151,784.9 (39.6%) |
21,391,908.7 (38.8%) |
3,585,799.7 (48.4%) |
174,076.5 (17.8%) |
|
| Child race |
|
|
|
|
<0.001 |
| White alone |
45,024,781.7 (70.9%) |
38,984,982.3 (70.7%) |
5,322,348.9 (71.8%) |
717,450.5 (73.5%) |
|
| Black or African American alone |
9,211,136.6 (14.5%) |
8,778,862.8 (15.9%) |
391,154.1 (5.3%) |
41,119.8 (4.2%) |
|
| American Indian or Alaska Native alone |
933,066.4 (1.5%) |
676,950.8 (1.2%) |
223,406.8 (3.0%) |
32,708.8 (3.4%) |
|
| Asian alone |
3,167,184.2 (5.0%) |
1,983,944.5 (3.6%) |
1,043,663.7 (14.1%) |
139,576.1 (14.3%) |
|
| Native Hawaiian / Other Pacific Islander alone |
555,660.5 (0.9%) |
295,800.1 (0.5%) |
228,501.5 (3.1%) |
31,358.9 (3.2%) |
|
| Two or more races |
4,615,031.4 (7.3%) |
4,397,124.1 (8.0%) |
204,177.7 (2.8%) |
13,729.6 (1.4%) |
|
| Household income (% Federal Poverty Level) |
|
|
|
|
<0.001 |
| 0-99% FPL |
11,367,759.7 (17.9%) |
8,482,758.4 (15.4%) |
2,473,575.0 (33.4%) |
411,426.3 (42.2%) |
|
| 100-199% FPL |
13,019,651.2 (20.5%) |
10,335,712.7 (18.8%) |
2,361,491.4 (31.9%) |
322,447.1 (33.0%) |
|
| 200-399% FPL |
18,556,161.1 (29.2%) |
16,797,650.1 (30.5%) |
1,587,690.0 (21.4%) |
170,820.9 (17.5%) |
|
| 400%+ FPL |
20,563,288.8 (32.4%) |
19,501,543.3 (35.4%) |
990,496.2 (13.4%) |
71,249.3 (7.3%) |
|
| Caregiver mental health |
|
|
|
|
<0.001 |
| Good (excellent/very good/good) |
56,596,561.7 (93.1%) |
48,979,295.5 (92.7%) |
6,718,058.6 (95.8%) |
899,207.6 (95.0%) |
|
| Poor (fair/poor) |
4,222,473.3 (6.9%) |
3,882,737.9 (7.3%) |
292,333.8 (4.2%) |
47,401.6 (5.0%) |
|
| Primary caregiver education |
|
|
|
|
<0.001 |
| Less than HS |
7,134,986.5 (11.2%) |
3,540,925.1 (6.4%) |
3,105,393.2 (41.9%) |
488,668.2 (50.1%) |
|
| High school |
13,880,949.6 (21.9%) |
11,921,178.1 (21.6%) |
1,736,522.2 (23.4%) |
223,249.3 (22.9%) |
|
| Some college |
13,102,039.4 (20.6%) |
12,151,865.3 (22.0%) |
868,514.7 (11.7%) |
81,659.4 (8.4%) |
|
| Bachelor's+ |
29,388,885.3 (46.3%) |
27,503,696.0 (49.9%) |
1,702,822.6 (23.0%) |
182,366.7 (18.7%) |
|
| Family structure |
|
|
|
|
<0.001 |
| Two parents |
42,321,892.8 (69.0%) |
36,797,907.3 (69.1%) |
4,946,999.9 (69.6%) |
576,985.6 (60.6%) |
|
| Single parent |
16,016,035.2 (26.1%) |
13,692,785.0 (25.7%) |
2,001,731.9 (28.1%) |
321,518.3 (33.8%) |
|
| Other |
3,002,133.6 (4.9%) |
2,785,384.9 (5.2%) |
162,671.7 (2.3%) |
54,077.0 (5.7%) |
|
| Health insurance status |
|
|
|
|
<0.001 |
| Insured |
58,858,804.0 (93.3%) |
51,846,033.7 (94.6%) |
6,297,997.6 (85.8%) |
714,772.7 (74.6%) |
|
| Uninsured |
4,251,660.2 (6.7%) |
2,967,318.7 (5.4%) |
1,041,336.4 (14.2%) |
243,005.0 (25.4%) |
|
| Child nativity |
|
|
|
|
<0.001 |
| US-born |
59,626,942.7 (95.1%) |
53,144,170.7 (97.6%) |
5,894,519.3 (80.5%) |
588,252.7 (61.8%) |
|
| Foreign-born |
3,071,038.5 (4.9%) |
1,280,048.6 (2.4%) |
1,426,707.8 (19.5%) |
364,282.1 (38.2%) |
|
| Urbanicity |
|
|
|
|
<0.001 |
| Urban |
51,265,204.7 (87.4%) |
43,888,436.8 (86.3%) |
6,535,387.1 (94.7%) |
841,380.8 (91.2%) |
|
| Rural |
7,422,160.0 (12.6%) |
6,976,375.3 (13.7%) |
364,316.5 (5.3%) |
81,468.1 (8.8%) |
|
| ACE burden score (continuous, 0-9) |
0.8 (1.3) |
0.8 (1.3) |
0.5 (0.9) |
0.5 (0.9) |
<0.001 |
| ACE burden category |
|
|
|
|
<0.001 |
| 0 ACEs |
38,157,489.9 (60.1%) |
32,709,152.7 (59.3%) |
4,769,265.7 (64.3%) |
679,071.5 (69.6%) |
|
| 1 ACE |
13,960,413.6 (22.0%) |
11,975,665.4 (21.7%) |
1,781,320.1 (24.0%) |
203,428.2 (20.8%) |
|
| 2 ACEs |
5,473,199.6 (8.6%) |
4,891,229.0 (8.9%) |
516,536.7 (7.0%) |
65,433.9 (6.7%) |
|
| 3+ ACEs |
5,915,757.6 (9.3%) |
5,541,617.5 (10.1%) |
346,130.1 (4.7%) |
28,010.0 (2.9%) |
|
| Survey year |
|
|
|
|
0.676 |
| 2020 |
21,133,243.9 (33.3%) |
18,379,482.8 (33.3%) |
2,449,804.9 (33.0%) |
303,956.2 (31.1%) |
|
| 2021 |
21,046,813.5 (33.1%) |
18,176,221.6 (33.0%) |
2,504,523.9 (33.8%) |
366,068.0 (37.5%) |
|
| 2022 |
21,326,803.4 (33.6%) |
18,561,960.2 (33.7%) |
2,458,923.8 (33.2%) |
305,919.4 (31.3%) |
|
Values are weighted mean (SD) for continuous variables and weighted n (%) for categorical variables.
P-values are design-adjusted: Rao-Scott chi-square tests for categorical variables and Wald tests from survey-weighted regression for continuous variables.
ACE burden is the count of 9 adverse childhood experience items (range 0-9); ACE1 dichotomized at ‘somewhat/very often’.
Pooled weights were computed as annual weights / 3 for combined 2020-2022 analyses. |
# 7) Export Table 1 to Word
library(officer) # fixes: could not find function "read_docx"
library(flextable)
# Convert gtsummary table to flextable
table1_ft <- table1 %>%
gtsummary::as_flex_table() %>%
flextable::autofit() %>%
flextable::font(fontname = "Times New Roman", part = "all") %>%
flextable::fontsize(size = 10, part = "all") %>%
flextable::theme_booktabs()
# Build Word document
doc <- officer::read_docx()
doc <- officer::body_add_par(
doc,
"Table 1. Descriptive Characteristics of the Analytic Sample by Household Language (NSCH 2020-2022)",
style = "heading 2"
)
doc <- officer::body_add_par(doc, " ", style = "Normal")
doc <- flextable::body_add_flextable(doc, table1_ft)
doc <- officer::body_add_par(doc, " ", style = "Normal")
doc <- officer::body_add_par(
doc,
"Values are weighted mean (SD) for continuous variables and weighted n (%) for categorical variables. P-values are design-adjusted: Rao-Scott chi-square tests for categorical variables and Wald tests from survey-weighted regression for continuous variables. ACE burden is the count of 9 adverse childhood experience items (range 0-9); ACE1 dichotomized at 'somewhat/very often'. Pooled weights computed as annual weights / 3 for combined 2020-2022 analyses.",
style = "Normal"
)
# Save
print(
doc,
target = file.path(project_folder, "outputs/tables/Table1_NSCH_Descriptives.docx")
)
cat("Table 1 saved to Word successfully.\n")
## Table 1 saved to Word successfully.
svymean(~is.na(caregiver_mental_health), nsch_design)
## mean SE
## is.na(caregiver_mental_health)FALSE 0.957677 0.0013
## is.na(caregiver_mental_health)TRUE 0.042323 0.0013
** Caregiver mental health was missing for approximately 4.2% of the
weighted sample and will be excluded from interaction analyses using
complete cases only
Section 4: Exploratory Data Analysis (EDA)
EDA Set Up
# ── EDA SETUP ────────────────────────────────────────────────────────────────
theme_eda <- theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 13),
plot.subtitle = element_text(size = 11, color = "gray40"),
axis.title = element_text(size = 11),
axis.text = element_text(size = 10),
legend.position = "bottom",
panel.grid.minor = element_blank(),
plot.caption = element_text(size = 9, color = "gray50")
)
lang_colors <- c(
"English only" = "#C0392B",
"Bilingual" = "#8E44AD",
"Non-English" = "#27AE60"
)
nsch_clean <- nsch_clean %>%
mutate(weight_pooled = survey_weight / 3)
nsch_design <- svydesign(
ids = ~HHID_F,
strata = ~STRATUM_F,
weights = ~weight_pooled,
data = nsch_clean,
nest = TRUE
)
options(survey.lonely.psu = "adjust")
cat("EDA setup complete.\n")
## EDA setup complete.
Interpretation: This block establishes a consistent visual theme and
color palette for all EDA figures, ensuring figures are
publication-ready and visually coherent. The survey design object
(nsch_design) correctly incorporates NSCH’s complex survey structure —
stratification, clustering by household, and pooled annual weights, so
that all subsequent estimates are nationally representative. Failing to
account for the complex design would produce artificially narrow
confidence intervals and potentially biased prevalence estimates.
Required Visualization
Figure 1a — Weighted Prevalence of the Binary Outcome
# ── Figure 1: Outcome distribution (binary) ──────────────────────────────────
# Design-based prevalence estimate
est1 <- svymean(~impairing_mental_behav, nsch_design, na.rm = TRUE)
p1 <- as.numeric(coef(est1))
se1 <- as.numeric(SE(est1))
fig1_df <- tibble::tibble(
Outcome = c("No impairing condition", "Any impairing condition"),
Prev = c(1 - p1, p1),
SE = c(se1, se1)
) %>%
mutate(
low = pmax(Prev - 1.96 * SE, 0),
high = pmin(Prev + 1.96 * SE, 1),
pct = round(Prev * 100, 1)
)
fig1a <- ggplot(fig1_df, aes(x = Outcome, y = Prev, fill = Outcome)) +
geom_col(width = 0.60, color = "white") +
geom_errorbar(aes(ymin = low, ymax = high), width = 0.18, linewidth = 0.8) +
geom_text(aes(label = paste0(pct, "%")), vjust = -0.7,
fontface = "bold", size = 5) +
scale_y_continuous(labels = percent_format(accuracy = 1),
limits = c(0, 1),
expand = expansion(mult = c(0, 0.10))) +
scale_fill_manual(values = c(
"No impairing condition" = "gray75",
"Any impairing condition" = "#B22222"
)) +
labs(
title = "Figure 1a. Weighted Distribution of Any Currently Impairing Condition",
subtitle = "NSCH 2020–2022 (children ages 2–17), survey-weighted estimate",
x = NULL,
y = "Weighted proportion of children",
caption = "Error bars = 95% CI from complex survey design using pooled weights (FWC/3)."
)
fig1a

ggsave(file.path(project_folder, "outputs/figures/Figure1_Binary_Outcome.png"),
fig1a, width = 9, height = 5.5, dpi = 300, bg = "white")
The weighted prevalence of any currently impairing mental or
behavioral condition in the analytic sample is 19.6%, meaning
approximately one in five children meets the case definition. The
remaining 80.4% of children do not have an active impairing condition at
the time of the survey. Because the outcome is binary, it does not
follow a normal distribution and cannot be evaluated for skewness in the
same way as a continuous variable. This confirms that logistic
regression, rather than linear regression, is the appropriate modeling
strategy for subsequent analyses.
The outcome is binary, so its distribution is best summarized as a
weighted prevalence rather than a histogram. Approximately one in five
U.S. children (about 19–20%) meets the study’s case definition of a
currently impairing condition, while about four in five do not. This
confirms that logistic regression is appropriate for modeling the
outcome in later sections. The relatively common prevalence also implies
odds ratios may differ from risk ratios, so interpretation should be
careful.
Figure 1b — Histogram of Concurrent Condition Count (0–6)
# Build additive condition count variable (0–6)
nsch_clean <- nsch_clean %>%
mutate(condition_count = has_adhd + has_depression + has_anxiety +
has_behavioral + has_autism + has_devdelay)
count_df <- nsch_clean %>%
count(condition_count) %>%
mutate(pct = 100 * n / sum(n))
fig1b <- ggplot(count_df, aes(x = factor(condition_count), y = pct,
fill = factor(condition_count))) +
geom_col(color = "white", show.legend = FALSE) +
geom_text(aes(label = paste0(round(pct, 1), "%")),
vjust = -0.5, size = 3.8) +
scale_fill_manual(values = c("#BDC3C7", "#F1948A", "#E74C3C", "#C0392B",
"#922B21", "#6E1C1C", "#4A0E0E")) +
scale_y_continuous(labels = label_number(suffix = "%"),
expand = expansion(mult = c(0, 0.1))) +
labs(
title = "Figure 1b. Number of Concurrent Impairing Conditions per Child (Unweighted)",
subtitle = "Distribution is strongly right-skewed; most children have 0 or 1 condition",
x = "Number of Impairing Conditions (0–6)",
y = "Percent of Children",
caption = "Conditions: ADHD, depression, anxiety, behavioral problems, autism, developmental delay."
) +
theme_eda
fig1b

ggsave("outputs/figures/Figure2b_Condition_Count_Histogram.png",
fig1b, width = 8, height = 5, dpi = 300)
Interpretation: The distribution of concurrent impairing conditions
is strongly right-skewed: the overwhelming majority of children have
zero conditions, a meaningful minority have exactly one, and very few
children carry two or more simultaneous diagnoses. This floor effect
(mass at zero) would severely violate the normality assumption if the
count were modeled as a continuous outcome, requiring at minimum a
zero-inflated Poisson or negative-binomial approach. The strong skew
confirms that collapsing the count into a binary “any vs. none”
composite is both statistically appropriate and clinically meaningful
for the primary analysis. Children with multiple concurrent conditions
represent a small but high-need subgroup worth examining in sensitivity
analyses.
Figure 1c — Weighted Prevalence by Individual Condition
condition_vars <- c("has_adhd", "has_depression", "has_anxiety",
"has_behavioral", "has_autism", "has_devdelay")
condition_labels <- c("ADHD", "Depression", "Anxiety",
"Behavioral\nProblems", "Autism", "Developmental\nDelay")
cond_prev <- map2_dfr(condition_vars, condition_labels, function(var, lab) {
est <- svymean(as.formula(paste0("~", var)), nsch_design, na.rm = TRUE)
data.frame(
Condition = lab,
Prevalence = as.numeric(coef(est)), # coerce to plain numeric
SE = as.numeric(SE(est)) # coerce to plain numeric — THIS was the fix
)
}) %>%
mutate(
lower = pmax(Prevalence - 1.96 * SE, 0),
upper = Prevalence + 1.96 * SE,
Condition = fct_reorder(Condition, Prevalence, .desc = TRUE)
)
fig1c <- ggplot(cond_prev, aes(x = Condition, y = Prevalence)) +
geom_col(fill = "#C0392B", width = 0.6, color = "white") +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, linewidth = 0.8) +
geom_text(aes(label = paste0(round(Prevalence * 100, 1), "%")),
vjust = -0.9, size = 3.8, fontface = "bold") +
scale_y_continuous(labels = percent_format(),
expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Figure 1c. Weighted Prevalence of Each Mental/Behavioral Condition",
subtitle = "NSCH 2020–2022; each condition requires diagnosis + current impairment",
x = NULL,
y = "Weighted Prevalence",
caption = "Error bars = 95% CI (design-adjusted)."
) +
theme_eda
fig1c

ggsave("outputs/figures/Figure2c_Condition_Prevalence.png",
fig1c, width = 8, height = 5, dpi = 300)
Interpretation: ADHD and anxiety are the two most prevalent
individual conditions, each affecting roughly 9–11% of children
nationally, while depression, behavioral problems, autism, and
developmental delay are each below 7%. This hierarchy is consistent with
published national epidemiological benchmarks (e.g., CDC ADHD
surveillance data), which validates the NSCH coding applied in Section
2. Practically, because ADHD and anxiety together drive the composite
outcome, the composite will be most sensitive to factors that predict
those two conditions; associations with rarer diagnoses (autism,
developmental delay) may be diluted and are better examined through the
neurodevelopmental secondary outcome. The non-overlapping confidence
intervals between ADHD/anxiety and the rarer conditions confirm that
these prevalence differences are statistically reliable.
Figure 2 — Main bivariate association: outcome by household language
(weighted + CI)
# ── Figure 2: Outcome by household language (weighted + 95% CI) ──────────────
prev_lang <- svyby(
~impairing_mental_behav,
~household_language,
design = nsch_design,
FUN = svymean,
na.rm = TRUE,
vartype = c("ci")
) %>%
as.data.frame() %>%
transmute(
household_language,
prev = impairing_mental_behav,
low = ci_l,
high = ci_u,
prev_pct = 100 * prev,
low_pct = 100 * low,
high_pct = 100 * high
)
ymax <- max(prev_lang$high_pct, na.rm = TRUE) * 1.25
fig2 <- ggplot(prev_lang,
aes(x = household_language, y = prev_pct, fill = household_language)) +
geom_col(width = 0.62, color = "white") +
geom_errorbar(aes(ymin = low_pct, ymax = high_pct),
width = 0.16, linewidth = 0.85, color = "gray25") +
geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
vjust = -0.8, fontface = "bold", size = 5) +
scale_fill_manual(values = lang_colors) +
scale_y_continuous(
limits = c(0, ymax),
labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0.06))
) +
labs(
title = "Figure 2. Any Impairing Condition by Household Language Environment",
subtitle = "Survey-weighted prevalence with 95% confidence intervals (NSCH 2020–2022)",
x = "Household language group",
y = "Weighted prevalence (%)",
caption = "Error bars = 95% CI accounting for NSCH stratification, clustering, and pooled weights (FWC/3)."
)
fig2

ggsave(file.path(project_folder, "outputs/figures/Figure2_Outcome_by_Language.png"),
fig2, width = 9.5, height = 6, dpi = 300, bg = "white")
Interpretation: Children in bilingual and
non-English households show lower crude survey-weighted prevalence of an
impairing condition compared with children in English-only households.
This pattern is consistent with the expected direction from the
immigrant health paradox literature. However, these are unadjusted
comparisons and may reflect differences in socioeconomic composition or
barriers to diagnosis and screening across language groups. The
regression models will test whether this gap persists after controlling
for confounders.
Figure 3 — Subtype prevalence by language (weighted + CI)
# ── Figure 3: Condition subtypes by language (weighted + 95% CI) ─────────────
get_prev <- function(varname) {
out <- svyby(
as.formula(paste0("~", varname)),
~household_language,
design = nsch_design,
FUN = svymean,
na.rm = TRUE,
vartype = c("ci")
) %>% as.data.frame()
dplyr::tibble(
household_language = out$household_language,
subtype = varname,
prev = out[[varname]],
low = out$ci_l,
high = out$ci_u
)
}
subtype_df <- dplyr::bind_rows(
get_prev("internalizing"),
get_prev("externalizing"),
get_prev("neurodevelopmental")
) %>%
mutate(
subtype_label = dplyr::case_when(
subtype == "internalizing" ~ "Internalizing (anxiety/depression)",
subtype == "externalizing" ~ "Externalizing (ADHD/behavioral)",
subtype == "neurodevelopmental" ~ "Neurodevelopmental (autism/dev. delay)",
TRUE ~ subtype
),
prev_pct = 100 * prev,
low_pct = 100 * low,
high_pct = 100 * high,
subtype_label = factor(
subtype_label,
levels = c("Internalizing (anxiety/depression)",
"Externalizing (ADHD/behavioral)",
"Neurodevelopmental (autism/dev. delay)")
)
)
ymax3 <- max(subtype_df$high_pct, na.rm = TRUE) * 1.25
fig3 <- ggplot(subtype_df,
aes(x = subtype_label, y = prev_pct, fill = household_language)) +
geom_col(position = position_dodge(width = 0.75),
width = 0.65, color = "white") +
geom_errorbar(aes(ymin = low_pct, ymax = high_pct),
position = position_dodge(width = 0.75),
width = 0.18, linewidth = 0.8, color = "gray25") +
geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
position = position_dodge(width = 0.75),
vjust = -0.75, size = 3.6, fontface = "bold") +
scale_fill_manual(values = lang_colors, name = "Household language") +
scale_y_continuous(
limits = c(0, ymax3),
labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0, 0.06))
) +
labs(
title = "Figure 3. Condition Subtype Prevalence by Household Language",
subtitle = "Survey-weighted prevalence with 95% confidence intervals (NSCH 2020–2022)",
x = "Outcome subtype",
y = "Weighted prevalence (%)",
caption = "Subtypes are derived from the same diagnosis + current impairment rule.\nError bars = 95% CI accounting for NSCH complex survey design and pooled weights (FWC/3)."
)
fig3

ggsave(file.path(project_folder, "outputs/figures/Figure3_Subtypes_by_Language.png"),
fig3, width = 12, height = 6, dpi = 300, bg = "white")
Interpretation: The language gradient differs by subtype: the
contrast between English-only and bilingual/non-English households may
be larger for some condition categories than others. This matters
because your composite outcome can be driven mostly by the more common
subtypes (often internalizing or externalizing), while
neurodevelopmental conditions are less common and often have wider
uncertainty. If subtype patterns differ meaningfully, it supports your
plan to run subtype-specific secondary models. These bivariate results
still require adjusted regression to separate language effects from
confounding and diagnostic access differences.
Figure 4: Outcome by age group x language (weighted + CI)
# ── Optional Figure 4: Outcome by age group x language (weighted + CI) ───────
prev_age_lang <- svyby(
~impairing_mental_behav,
~age_group + household_language,
design = nsch_design,
FUN = svymean,
na.rm = TRUE,
vartype = c("ci")
) %>%
as.data.frame() %>%
mutate(
prev_pct = 100 * impairing_mental_behav,
low_pct = 100 * ci_l,
high_pct = 100 * ci_u
)
ymax4 <- max(prev_age_lang$high_pct, na.rm = TRUE) * 1.25
fig4 <- ggplot(prev_age_lang,
aes(x = age_group, y = prev_pct,
color = household_language, group = household_language)) +
geom_line(linewidth = 1.1) +
geom_point(size = 3.2) +
geom_errorbar(aes(ymin = low_pct, ymax = high_pct),
width = 0.10, linewidth = 0.7, alpha = 0.9) +
geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
vjust = -0.9, size = 3.4, show.legend = FALSE) +
scale_color_manual(values = lang_colors, name = "Household language") +
scale_y_continuous(
limits = c(0, ymax4),
labels = function(x) paste0(x, "%"),
expand = expansion(mult = c(0.02, 0.06))
) +
labs(
title = "Figure 4. Any Impairing Condition by Age Group and Household Language",
subtitle = "Survey-weighted prevalence with 95% confidence intervals (NSCH 2020–2022)",
x = "Age group (years)",
y = "Weighted prevalence (%)",
caption = "Error bars = 95% CI from complex survey design using pooled weights (FWC/3)."
)
fig4

ggsave(file.path(project_folder, "outputs/figures/Figure4_Age_by_Language.png"),
fig4, width = 10, height = 6, dpi = 300, bg = "white")
Across all three household language environments, the prevalence of
any currently impairing mental or behavioral condition increases
substantially with age. This age gradient is expected epidemiologically,
as internalizing disorders such as anxiety and depression typically
emerge more clearly in middle childhood and adolescence, and diagnostic
recognition increases with school exposure and health system
contact.
Among children ages 2–5 years, prevalence is relatively similar
across English-only (9.1%) and non-English (9.4%) households, with
bilingual households lower (4.1%). At this early developmental stage,
confidence intervals overlap considerably, suggesting limited
differentiation by language environment in early childhood.
The divergence becomes more pronounced beginning at ages 6–11 years.
English-only households show a prevalence of 21.2%, compared with 17.2%
in non-English households and only 6.3% in bilingual households. By
adolescence (12–17 years), the gap widens further: English-only
households reach 28.4%, non-English households 22.4%, and bilingual
households 12.5%.
Three important patterns emerge:
First, the age slope is steepest in English-only households.
Prevalence triples from early childhood to adolescence. This suggests
either greater cumulative risk exposure, greater diagnostic
ascertainment, or both.
Second, bilingual households consistently show the lowest prevalence
at every age group, and the gap widens with age. This pattern is
consistent with a potential protective gradient associated with
bilingual environments, though unadjusted.
Third, non-English households fall between English-only and bilingual
households at older ages. However, note the wide confidence intervals in
the non-English adolescent group — this reflects the small sample size
(approximately 0.6% of the analytic sample), and those estimates should
be interpreted cautiously.
From a modeling perspective, this figure suggests two key
implications:
• Age must be included as a covariate in all regression models. •
There is visual evidence of possible age × language interaction
(non-parallel lines). The widening gap in adolescence suggests the
association between household language and mental health may strengthen
over developmental stages.
Substantively, the increasing gap with age is consistent with
literature showing that cultural protective factors (family cohesion,
collectivism, reduced externalizing behavior) may become more
influential during adolescence, while also reflecting differential
access to diagnosis across language environments (Alegría et al., 2010;
Marks et al., 2014).
Importantly, this is still an unadjusted figure. Income, parental
education, race/ethnicity, and caregiver mental health are unevenly
distributed across language groups. The apparent protective pattern
could attenuate or strengthen once those confounders are controlled in
Section 5 regression models.
In summary, the figure supports your core hypothesis visually:
children in bilingual and non-English households appear to have lower
crude prevalence of impairing conditions, and the difference becomes
more pronounced in adolescence. However, formal interaction testing in
logistic regression will be required to confirm whether this
age-dependent pattern is statistically significant after adjustment.
Figure 5 — Outcome by Language × Child Race
race_lang_prev <- map_dfr(
levels(nsch_clean$household_language), function(grp) {
map_dfr(levels(nsch_clean$child_race), function(race) {
sub_data <- nsch_clean %>%
filter(household_language == grp, child_race == race)
if (nrow(sub_data) < 30) return(NULL)
sub_design2 <- svydesign(
ids = ~HHID_F,
strata = ~STRATUM_F,
weights = ~weight_pooled,
data = sub_data,
nest = TRUE
)
est <- svymean(~impairing_mental_behav, sub_design2, na.rm = TRUE)
data.frame(
Language = grp,
Race = race,
Prevalence = as.numeric(coef(est)), # fix
SE = as.numeric(SE(est)), # fix
n = nrow(sub_data)
)
})
}
) %>%
mutate(
lower = pmax(Prevalence - 1.96 * SE, 0),
upper = Prevalence + 1.96 * SE,
Language = factor(Language, levels = c("English only", "Bilingual", "Non-English")),
Race = case_when(
Race == "White alone" ~ "White",
Race == "Black or African American alone" ~ "Black",
Race == "American Indian or Alaska Native alone" ~ "AIAN",
Race == "Asian alone" ~ "Asian",
Race == "Native Hawaiian / Other Pacific Islander alone" ~ "NHOPI",
Race == "Two or more races" ~ "Multiracial",
TRUE ~ Race
)
)
fig5 <- ggplot(race_lang_prev,
aes(x = Race, y = Prevalence, color = Language, group = Language)) +
geom_line(linewidth = 1.1) +
geom_point(aes(size = n), alpha = 0.85) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.25, linewidth = 0.6) +
scale_color_manual(values = lang_colors, name = "Household Language") +
scale_size_continuous(name = "Unweighted n", range = c(2, 7),
labels = comma_format()) +
scale_y_continuous(labels = percent_format(),
expand = expansion(mult = c(0.02, 0.1))) +
labs(
title = "Figure 5. Prevalence of Any Impairing Condition by Household Language and Child Race",
subtitle = "NSCH 2020–2022 (design-adjusted; cells with n < 30 excluded)",
x = "Child Race/Ethnicity",
y = "Weighted Prevalence",
caption = "Error bars = 95% CI. Point size proportional to unweighted cell n."
) +
theme_eda +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
fig5

ggsave("outputs/figures/Figure4_Outcome_by_Language_Race.png",
fig5, width = 10, height = 6, dpi = 300)
Interpretation: The language–outcome gradient is not uniform across
racial groups, which is the visual signature of effect modification.
White children in English-only households show among the highest
prevalences overall, while Asian children in non-English households show
the lowest, consistent with the immigrant paradox being strongest in
recently arrived, non-White immigrant communities. The three language
lines are non-parallel across racial categories — for example, the gap
between English-only and non-English households is wider for White and
multiracial children than for Black children — which motivates testing a
formal language × race interaction term in adjusted regression models.
Cells for American Indian/Alaska Native and Native Hawaiian/Pacific
Islander children should be interpreted cautiously given small
unweighted sample sizes, and may need to be collapsed or excluded in
formal models to ensure stable estimates.
Figure 6 — Outcome by Language × Household Income
income_lang_prev <- map_dfr(
levels(nsch_clean$household_language), function(grp) {
map_dfr(levels(nsch_clean$income_fpl), function(inc) {
sub_data <- nsch_clean %>%
filter(household_language == grp, income_fpl == inc)
if (nrow(sub_data) < 30) return(NULL)
sub_design3 <- svydesign(
ids = ~HHID_F,
strata = ~STRATUM_F,
weights = ~weight_pooled,
data = sub_data,
nest = TRUE
)
est <- svymean(~impairing_mental_behav, sub_design3, na.rm = TRUE)
data.frame(
Language = grp,
Income = inc,
Prevalence = as.numeric(coef(est)), # fix
SE = as.numeric(SE(est)), # fix
n = nrow(sub_data)
)
})
}
) %>%
mutate(
lower = pmax(Prevalence - 1.96 * SE, 0),
upper = Prevalence + 1.96 * SE,
Language = factor(Language, levels = c("English only", "Bilingual", "Non-English")),
Income = factor(Income, levels = c("0-99% FPL", "100-199% FPL",
"200-399% FPL", "400%+ FPL"))
)
fig6 <- ggplot(income_lang_prev,
aes(x = Income, y = Prevalence, fill = Language)) +
geom_col(position = position_dodge(0.85), width = 0.72, color = "white") +
geom_errorbar(aes(ymin = lower, ymax = upper),
position = position_dodge(0.85), width = 0.25, linewidth = 0.65) +
geom_text(aes(label = paste0(round(Prevalence * 100, 1), "%")),
position = position_dodge(0.85),
vjust = -0.7, size = 3, fontface = "bold") +
scale_fill_manual(values = lang_colors, name = "Household Language") +
scale_y_continuous(labels = percent_format(),
expand = expansion(mult = c(0, 0.18))) +
labs(
title = "Figure 6. Prevalence of Any Impairing Condition by Household Language and Household Income",
subtitle = "NSCH 2020–2022 (design-adjusted weighted estimates)",
x = "Household Income (% Federal Poverty Level)",
y = "Weighted Prevalence",
caption = "Error bars = 95% CI."
) +
theme_eda +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
fig6

ggsave("outputs/figures/Figure5_Outcome_by_Language_Income.png",
fig6, width = 10, height = 6, dpi = 300)
Interpretation: This figure was chosen to directly test whether the
crude language–outcome association is simply a proxy for socioeconomic
disadvantage, since non-English households are disproportionately
low-income. The language gradient persists within every income stratum:
even among the lowest-income children (0–99% FPL), those in non-English
households have lower diagnosed prevalences than English-only children
in the same income band. This within-stratum persistence suggests that
income alone does not fully explain the association, pointing instead to
cultural, linguistic, or acculturation-related mechanisms. Nonetheless,
income still acts as a confounder because it is unevenly distributed
across language groups, so it must be included in all adjusted
regression models.
Part 2 Regression Analysis
STEP 1 — Confirm Survey Design (Final Check Before Modeling)
# ---------------------------------------
# Define complete-case regression sample
# ---------------------------------------
analysis_complete <- nsch_clean %>%
filter(
!is.na(caregiver_mental_health),
!is.na(urban_rural),
!is.na(income_fpl),
!is.na(parent_education),
!is.na(family_structure),
!is.na(ace_burden_cat)
)
cat("Complete-case N for regression:", nrow(analysis_complete), "\n")
## Complete-case N for regression: 108713
This defines a single complete-case analytic sample before
regression
Building survey design
nsch_design_complete <- svydesign(
ids = ~HHID_F,
strata = ~STRATUM_F,
weights = ~weight_pooled,
data = nsch_clean,
nest = TRUE
)
options(survey.lonely.psu = "adjust")
This ensures:
STEP 2 — Unadjusted Logistic Regression
Before controlling for anything, is household language associated
with impairing condition?
model_crude <- svyglm(
impairing_mental_behav ~ household_language,
design = nsch_design_complete,
family = quasibinomial()
)
summary(model_crude)
##
## Call:
## svyglm(formula = impairing_mental_behav ~ household_language,
## design = nsch_design_complete, family = quasibinomial())
##
## Survey design:
## svydesign(ids = ~HHID_F, strata = ~STRATUM_F, weights = ~weight_pooled,
## data = nsch_clean, nest = TRUE)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.31643 0.01385 -95.038 < 0.0000000000000002
## household_languageBilingual -0.99562 0.08765 -11.359 < 0.0000000000000002
## household_languageNon-English -0.46733 0.15934 -2.933 0.00336
##
## (Intercept) ***
## household_languageBilingual ***
## household_languageNon-English **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.000008)
##
## Number of Fisher Scoring iterations: 4
# Convert to odds ratios:
crude_results <- broom::tidy(model_crude) %>%
mutate(
OR = exp(estimate),
LCL = exp(estimate - 1.96 * std.error),
UCL = exp(estimate + 1.96 * std.error)
)
print(crude_results, width = Inf)
## # A tibble: 3 × 8
## term estimate std.error statistic p.value OR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.32 0.0139 -95.0 0 0.268
## 2 household_languageBilingual -0.996 0.0877 -11.4 6.93e-30 0.369
## 3 household_languageNon-English -0.467 0.159 -2.93 3.36e- 3 0.627
## LCL UCL
## <dbl> <dbl>
## 1 0.261 0.275
## 2 0.311 0.439
## 3 0.459 0.856
Interpret the crude model results: Reference category is English-only
households. Coefficients (log-odds scale):
Crude (Unadjusted) Results
Compared to English-only households, both language minority groups
showed significantly lower odds of any impairing mental or behavioral
condition before accounting for any covariates.
Bilingual households had 63% lower odds of an impairing condition (OR
= 0.369, 95% CI: 0.311–0.439, p < .001). This is a large and highly
significant effect even in the unadjusted model, and the confidence
interval is relatively tight, suggesting a stable estimate.
Non-English only households had 37% lower odds compared to
English-only households (OR = 0.627, 95% CI: 0.459–0.856, p = .003).
This association is also significant but the confidence interval is
wider, reflecting the smaller sample size in that group and more
uncertainty around the estimate. The intercept (OR = 0.268) represents
the baseline odds of an impairing condition for English-only households
and is not something you would typically report or interpret directly in
a paper.
The key story at this stage is that both language minority groups
appear protective relative to English-only households in the unadjusted
analysis, but as you already know from the adjusted model, the
non-English only effect does not survive adjustment for sociodemographic
covariates while the bilingual effect remains strong and actually grows
slightly. That contrast between the two groups is the heart of your
findin
STEP 3 — Fully Adjusted Main Effects Model
model_adj <- svyglm(
impairing_mental_behav ~ household_language +
child_age + child_sex + child_race +
income_fpl + parent_education +
family_structure + has_insurance +
child_nativity + urban_rural + ace_burden_cat +
survey_year,
design = nsch_design_complete,
family = quasibinomial()
)
summary(model_adj)
##
## Call:
## svyglm(formula = impairing_mental_behav ~ household_language +
## child_age + child_sex + child_race + income_fpl + parent_education +
## family_structure + has_insurance + child_nativity + urban_rural +
## ace_burden_cat + survey_year, design = nsch_design_complete,
## family = quasibinomial())
##
## Survey design:
## svydesign(ids = ~HHID_F, strata = ~STRATUM_F, weights = ~weight_pooled,
## data = nsch_clean, nest = TRUE)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.393943 0.107421
## household_languageBilingual -0.967033 0.105931
## household_languageNon-English -0.035915 0.181730
## child_age 0.088717 0.003593
## child_sexFemale -0.357459 0.032281
## child_raceBlack or African American alone -0.296272 0.053768
## child_raceAmerican Indian or Alaska Native alone -0.343839 0.123900
## child_raceAsian alone -0.771184 0.091657
## child_raceNative Hawaiian / Other Pacific Islander alone -0.443445 0.264486
## child_raceTwo or more races 0.001666 0.057381
## income_fpl100-199% FPL -0.072757 0.058277
## income_fpl200-399% FPL -0.216216 0.055455
## income_fpl400%+ FPL -0.211976 0.056548
## parent_educationHigh school 0.060391 0.092639
## parent_educationSome college 0.151669 0.091648
## parent_educationBachelor's+ 0.140600 0.092414
## family_structureSingle parent -0.118660 0.045414
## family_structureOther 0.159735 0.077350
## has_insuranceUninsured -0.407917 0.087302
## child_nativityForeign-born -0.178300 0.108271
## urban_ruralRural -0.109776 0.042422
## ace_burden_cat1 ACE 0.547107 0.043976
## ace_burden_cat2 ACEs 0.913545 0.057430
## ace_burden_cat3+ ACEs 1.538249 0.060390
## survey_year2021 0.059151 0.041936
## survey_year2022 0.146671 0.039837
## t value
## (Intercept) -22.286
## household_languageBilingual -9.129
## household_languageNon-English -0.198
## child_age 24.689
## child_sexFemale -11.073
## child_raceBlack or African American alone -5.510
## child_raceAmerican Indian or Alaska Native alone -2.775
## child_raceAsian alone -8.414
## child_raceNative Hawaiian / Other Pacific Islander alone -1.677
## child_raceTwo or more races 0.029
## income_fpl100-199% FPL -1.248
## income_fpl200-399% FPL -3.899
## income_fpl400%+ FPL -3.749
## parent_educationHigh school 0.652
## parent_educationSome college 1.655
## parent_educationBachelor's+ 1.521
## family_structureSingle parent -2.613
## family_structureOther 2.065
## has_insuranceUninsured -4.672
## child_nativityForeign-born -1.647
## urban_ruralRural -2.588
## ace_burden_cat1 ACE 12.441
## ace_burden_cat2 ACEs 15.907
## ace_burden_cat3+ ACEs 25.472
## survey_year2021 1.411
## survey_year2022 3.682
## Pr(>|t|)
## (Intercept) < 0.0000000000000002
## household_languageBilingual < 0.0000000000000002
## household_languageNon-English 0.843336
## child_age < 0.0000000000000002
## child_sexFemale < 0.0000000000000002
## child_raceBlack or African American alone 0.0000000359
## child_raceAmerican Indian or Alaska Native alone 0.005519
## child_raceAsian alone < 0.0000000000000002
## child_raceNative Hawaiian / Other Pacific Islander alone 0.093618
## child_raceTwo or more races 0.976842
## income_fpl100-199% FPL 0.211867
## income_fpl200-399% FPL 0.0000966633
## income_fpl400%+ FPL 0.000178
## parent_educationHigh school 0.514471
## parent_educationSome college 0.097945
## parent_educationBachelor's+ 0.128158
## family_structureSingle parent 0.008981
## family_structureOther 0.038915
## has_insuranceUninsured 0.0000029797
## child_nativityForeign-born 0.099604
## urban_ruralRural 0.009663
## ace_burden_cat1 ACE < 0.0000000000000002
## ace_burden_cat2 ACEs < 0.0000000000000002
## ace_burden_cat3+ ACEs < 0.0000000000000002
## survey_year2021 0.158391
## survey_year2022 0.000232
##
## (Intercept) ***
## household_languageBilingual ***
## household_languageNon-English
## child_age ***
## child_sexFemale ***
## child_raceBlack or African American alone ***
## child_raceAmerican Indian or Alaska Native alone **
## child_raceAsian alone ***
## child_raceNative Hawaiian / Other Pacific Islander alone .
## child_raceTwo or more races
## income_fpl100-199% FPL
## income_fpl200-399% FPL ***
## income_fpl400%+ FPL ***
## parent_educationHigh school
## parent_educationSome college .
## parent_educationBachelor's+
## family_structureSingle parent **
## family_structureOther *
## has_insuranceUninsured ***
## child_nativityForeign-born .
## urban_ruralRural **
## ace_burden_cat1 ACE ***
## ace_burden_cat2 ACEs ***
## ace_burden_cat3+ ACEs ***
## survey_year2021
## survey_year2022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.066082)
##
## Number of Fisher Scoring iterations: 5
ace_burden_cat (categorical) was used because I
wasnted to allow non-linear effects and report ORs for each burden level
vs. 0 ACEs.
Primary Exposure: Household Language Reference =
English-only households
Bilingual households: OR = exp(−0.967) = 0.38 —
after adjusting for all covariates including ACE burden, bilingual
households have 62% lower odds of the child having an impairing
mental/behavioral condition. Highly significant (p < 0.001).
Non-English households: OR = exp(−0.036) = 0.96 —
essentially no difference from English-only, and non-significant (p =
0.84). This is a notable shift from the crude model where Non-English
was significantly protective, suggesting confounders (likely ACE burden,
income, education) were masking the true relationship.
ACE Burden (Primary Covariate of Interest) A clear,
strong, dose-response gradient — every category is highly significant
(all p < 0.001
#Convert to odds ratios:
Key Covariates Child characteristics:
Age: Each additional year increases odds by exp(0.089) = 1.09 (9% per
year), p < 0.001 Sex: Females have 30% lower odds than males (OR =
0.70), p < 0.001 Race: Black (OR = 0.74), AIAN (OR = 0.71), and Asian
(OR = 0.46) children all have significantly lower odds than White
children
Socioeconomic:
Income: 200–399% FPL (OR = 0.81) and 400%+ FPL (OR = 0.81) both
significantly lower odds vs. 0–99% FPL — poverty is a risk factor Parent
education: No significant differences after adjustment — absorbed by ACE
and income Uninsured: OR = 0.67, significantly lower odds — likely
reflects underdiagnosis rather than true protection
Family/structural:
Single parent: OR = 0.89, modestly lower odds vs. two-parent (p =
0.009) Other family structure: OR = 1.17, slightly higher odds (p =
0.039) Rural: OR = 0.90, lower odds than urban (p = 0.010) — again
likely reflects access/diagnosis gaps
adj_results <- broom::tidy(model_adj) %>%
mutate(
OR = exp(estimate),
LCL = exp(estimate - 1.96 * std.error),
UCL = exp(estimate + 1.96 * std.error)
)
adj_results
## # A tibble: 26 × 8
## term estimate std.error statistic p.value OR LCL UCL
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.39 0.107 -22.3 9.00e-110 0.0913 0.0739 0.113
## 2 household_languag… -0.967 0.106 -9.13 7.04e- 20 0.380 0.309 0.468
## 3 household_languag… -0.0359 0.182 -0.198 8.43e- 1 0.965 0.676 1.38
## 4 child_age 0.0887 0.00359 24.7 3.30e-134 1.09 1.09 1.10
## 5 child_sexFemale -0.357 0.0323 -11.1 1.75e- 28 0.699 0.657 0.745
## 6 child_raceBlack o… -0.296 0.0538 -5.51 3.59e- 8 0.744 0.669 0.826
## 7 child_raceAmerica… -0.344 0.124 -2.78 5.52e- 3 0.709 0.556 0.904
## 8 child_raceAsian a… -0.771 0.0917 -8.41 4.02e- 17 0.462 0.386 0.553
## 9 child_raceNative … -0.443 0.264 -1.68 9.36e- 2 0.642 0.382 1.08
## 10 child_raceTwo or … 0.00167 0.0574 0.0290 9.77e- 1 1.00 0.895 1.12
## # ℹ 16 more rows
# converting from log-odds coefficient to odds ratios
adj_results <- tidy(model_adj, exponentiate = TRUE, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~round(., 3)))
print(adj_results, n = Inf, width = Inf)
## # A tibble: 26 × 7
## term estimate std.error
## <chr> <dbl> <dbl>
## 1 (Intercept) 0.091 0.107
## 2 household_languageBilingual 0.38 0.106
## 3 household_languageNon-English 0.965 0.182
## 4 child_age 1.09 0.004
## 5 child_sexFemale 0.699 0.032
## 6 child_raceBlack or African American alone 0.744 0.054
## 7 child_raceAmerican Indian or Alaska Native alone 0.709 0.124
## 8 child_raceAsian alone 0.462 0.092
## 9 child_raceNative Hawaiian / Other Pacific Islander alone 0.642 0.264
## 10 child_raceTwo or more races 1.00 0.057
## 11 income_fpl100-199% FPL 0.93 0.058
## 12 income_fpl200-399% FPL 0.806 0.055
## 13 income_fpl400%+ FPL 0.809 0.057
## 14 parent_educationHigh school 1.06 0.093
## 15 parent_educationSome college 1.16 0.092
## 16 parent_educationBachelor's+ 1.15 0.092
## 17 family_structureSingle parent 0.888 0.045
## 18 family_structureOther 1.17 0.077
## 19 has_insuranceUninsured 0.665 0.087
## 20 child_nativityForeign-born 0.837 0.108
## 21 urban_ruralRural 0.896 0.042
## 22 ace_burden_cat1 ACE 1.73 0.044
## 23 ace_burden_cat2 ACEs 2.49 0.057
## 24 ace_burden_cat3+ ACEs 4.66 0.06
## 25 survey_year2021 1.06 0.042
## 26 survey_year2022 1.16 0.04
## statistic p.value conf.low conf.high
## <dbl> <dbl> <dbl> <dbl>
## 1 -22.3 0 0.074 0.113
## 2 -9.13 0 0.309 0.468
## 3 -0.198 0.843 0.676 1.38
## 4 24.7 0 1.08 1.1
## 5 -11.1 0 0.657 0.745
## 6 -5.51 0 0.669 0.826
## 7 -2.78 0.006 0.556 0.904
## 8 -8.41 0 0.386 0.553
## 9 -1.68 0.094 0.382 1.08
## 10 0.029 0.977 0.895 1.12
## 11 -1.25 0.212 0.829 1.04
## 12 -3.90 0 0.723 0.898
## 13 -3.75 0 0.724 0.904
## 14 0.652 0.514 0.886 1.27
## 15 1.66 0.098 0.972 1.39
## 16 1.52 0.128 0.96 1.38
## 17 -2.61 0.009 0.812 0.971
## 18 2.06 0.039 1.01 1.36
## 19 -4.67 0 0.56 0.789
## 20 -1.65 0.1 0.677 1.03
## 21 -2.59 0.01 0.825 0.974
## 22 12.4 0 1.59 1.88
## 23 15.9 0 2.23 2.79
## 24 25.5 0 4.14 5.24
## 25 1.41 0.158 0.977 1.15
## 26 3.68 0 1.07 1.25
These results were without ACE Household Language (Primary
Exposure)
After adjusting for all covariates, bilingual households still show
significantly lower odds of any impairing condition compared to
English-only households (β = −1.10, p < .001), converting to roughly
exp(−1.10) ≈ 0.33, about 67% lower odds. The non-English only group,
however, is no longer significant (β = −0.25, p = .196), meaning once
you account for sociodemographic factors, that group’s apparent
protection disappears.
Bilingual households have an OR of 0.334, meaning about 67% lower
odds of an impairing condition compared to English-only households, and
that is highly significant. Non-English only comes in at 0.779 and is
not significant (p = .196), confirming what we discussed earlier about
confounders explaining that association.
Child Characteristics
Older children have higher odds of diagnosis, which makes sense given
cumulative detection over time (β = 0.10, p < .001). Girls have
significantly lower odds than boys (β = −0.34), consistent with the
literature on male-predominant conditions like ADHD and autism. For
race, Black, Asian, and Native Hawaiian/Pacific Islander children all
show significantly lower odds compared to the White reference group,
with Asian children showing the largest gap (β = −0.92).
Each additional year of age increases odds by about 11% (OR = 1.11).
Girls have 29% lower odds than boys (OR = 0.713). Asian children show
the largest racial gap with 60% lower odds than the White reference
group (OR = 0.398), and Black children show about 28% lower odds (OR =
0.719).
Socioeconomic Factors
Higher income was protective. Compared to families below 100% FPL,
those at 200–399% FPL had 24% lower odds (OR = 0.762, 95% CI:
0.684–0.848, p < .001) and those at 400%+ FPL had 31% lower odds (OR
= 0.690, 95% CI: 0.619–0.770, p < .001). The 100–199% FPL group was
not significantly different from the lowest income group (OR = 0.938,
95% CI: 0.837–1.05, p = .269), suggesting the protective income gradient
really begins above 200% FPL. For parental education, only the some
college group differed significantly from the less than high school
reference, with 22% higher odds (OR = 1.22, 95% CI: 1.02–1.46, p =
.032). High school and bachelor’s and above were not significant, which
is a somewhat unusual pattern worth acknowledging.
Family Structure This was one of the stronger
predictors in the model. Single parent households had 41% higher odds
compared to two parent households (OR = 1.41, 95% CI: 1.31–1.52, p <
.001). The other family structure category, which likely includes
grandparent-headed or other non-traditional arrangements, showed more
than double the odds (OR = 2.11, 95% CI: 1.84–2.42, p < .001).
Insurance, Nativity, and Urbanicity Uninsured
children had 37% lower odds of a diagnosed impairing condition (OR =
0.634, 95% CI: 0.531–0.757, p < .001). This almost certainly reflects
reduced healthcare access and underdiagnosis rather than true lower
burden, and is worth flagging as a limitation. Foreign-born children and
rural residence were both non-significant after adjustment. Survey year
2022 showed modestly higher odds compared to 2020 (OR = 1.15, 95% CI:
1.07–1.25, p < .001), possibly reflecting pandemic-related diagnostic
trends or survey changes across years.
Meaning
In the crude model, the bilingual coefficient was
−0.996. In the adjusted model, it is −1.097. That is a slightly larger
negative number, meaning the estimated protective effect got a little
bigger after adding all the covariates, not smaller.
Compare that to the non-English group, where the coefficient went
from −0.467 (significant) in the crude model to −0.249 (not significant)
in the adjusted model. That one did attenuate, meaning confounders were
partly explaining that association. The bilingual group behaved
differently, and that contrast is worth highlighting in your
results.
STEP 4 — Test Effect Modification (Interaction Models)
Three moderators:
• Urbanicity • Caregiver mental health • ACE burden
#4A — Language × Urbanicity
# 4A — Language × Urbanicity
model_int_urban <- svyglm(
impairing_mental_behav ~ household_language * urban_rural +
child_age + child_sex + child_race +
income_fpl + parent_education + ace_burden_cat +
family_structure + has_insurance +
child_nativity + survey_year,
design = nsch_design_complete,
family = quasibinomial()
)
regTermTest(model_int_urban, ~ household_language:urban_rural)
## Wald test for household_language:urban_rural
## in svyglm(formula = impairing_mental_behav ~ household_language *
## urban_rural + child_age + child_sex + child_race + income_fpl +
## parent_education + ace_burden_cat + family_structure + has_insurance +
## child_nativity + survey_year, design = nsch_design_complete,
## family = quasibinomial())
## F = 0.2513849 on 2 and 108687 df: p= 0.77772
The interaction between household language and urban/rural residence
is not significant (F = 0.21, p = .809). This means the relationship
between household language and impairing conditions does not
meaningfully differ between urban and rural settings. The language
effect is essentially the same regardless of where the family lives, so
you do not need to retain this interaction term in your model.
#4B — Language × Caregiver Mental Health
model_int_caregiver <- svyglm(
impairing_mental_behav ~ household_language * caregiver_mental_health +
child_age + child_sex + child_race +
income_fpl + parent_education + ace_burden_cat +
family_structure + has_insurance +
child_nativity + urban_rural + survey_year,
design = nsch_design_complete,
family = quasibinomial()
)
regTermTest(model_int_caregiver, ~ household_language:caregiver_mental_health)
## Wald test for household_language:caregiver_mental_health
## in svyglm(formula = impairing_mental_behav ~ household_language *
## caregiver_mental_health + child_age + child_sex + child_race +
## income_fpl + parent_education + ace_burden_cat + family_structure +
## has_insurance + child_nativity + urban_rural + survey_year,
## design = nsch_design_complete, family = quasibinomial())
## F = 1.19102 on 2 and 107999 df: p= 0.30392
Another clean null result — F = 1.19, df = 2, p = 0.304.
Interpretation The association between household language and child
mental/behavioral conditions does not differ by caregiver mental health
status. The bilingual protective effect is roughly the same regardless
of whether the caregiver has good or poor mental health. This is
actually theoretically interesting, you might have expected that poor
caregiver mental health would erode the bilingual protective effect
(stressed caregivers less able to maintain cultural/linguistic
protective practices), but the data don’t support that.
#4C — Language × ACE Burden
model_int_ace <- svyglm(
impairing_mental_behav ~ household_language * ace_burden_cat +
child_age + child_sex + child_race +
income_fpl + parent_education +
family_structure + has_insurance +
child_nativity + urban_rural + survey_year,
design = nsch_design_complete,
family = quasibinomial()
)
regTermTest(model_int_ace, ~ household_language:ace_burden_cat)
## Wald test for household_language:ace_burden_cat
## in svyglm(formula = impairing_mental_behav ~ household_language *
## ace_burden_cat + child_age + child_sex + child_race + income_fpl +
## parent_education + family_structure + has_insurance + child_nativity +
## urban_rural + survey_year, design = nsch_design_complete,
## family = quasibinomial())
## F = 0.3699278 on 6 and 108683 df: p= 0.89844
Interpretation The protective effect of bilingual/non-English
household language on child mental/behavioral conditions does not vary
across ACE burden levels. A bilingual family with 3+ ACEs has roughly
the same language-associated protection as a bilingual family with 0
ACEs. ACE burden and household language operate as independent, additive
risk/protective factors rather than interacting ones. This is actually a
meaningful finding, it suggests the bilingual protective effect is
robust across adversity levels, not contingent on having a low-stress
household environment.
#STEP 5 — Stratified Models (If Interaction Significant)
model_urban <- svyglm(
impairing_mental_behav ~ household_language +
child_age + child_sex + child_race +
income_fpl + parent_education + ace_burden_cat +
family_structure + has_insurance +
child_nativity + survey_year,
design = subset(nsch_design_complete, urban_rural == "Urban"),
family = quasibinomial()
)
model_urban
## Stratified Independent Sampling design (with replacement)
## subset(nsch_design_complete, urban_rural == "Urban")
##
## Call: svyglm(formula = impairing_mental_behav ~ household_language +
## child_age + child_sex + child_race + income_fpl + parent_education +
## ace_burden_cat + family_structure + has_insurance + child_nativity +
## survey_year, design = subset(nsch_design_complete, urban_rural ==
## "Urban"), family = quasibinomial())
##
## Coefficients:
## (Intercept)
## -2.47333
## household_languageBilingual
## -0.96418
## household_languageNon-English
## -0.00369
## child_age
## 0.08952
## child_sexFemale
## -0.33062
## child_raceBlack or African American alone
## -0.32598
## child_raceAmerican Indian or Alaska Native alone
## -0.45690
## child_raceAsian alone
## -0.77526
## child_raceNative Hawaiian / Other Pacific Islander alone
## -0.50117
## child_raceTwo or more races
## -0.04246
## income_fpl100-199% FPL
## -0.03190
## income_fpl200-399% FPL
## -0.20716
## income_fpl400%+ FPL
## -0.17830
## parent_educationHigh school
## 0.12109
## parent_educationSome college
## 0.19462
## parent_educationBachelor's+
## 0.19436
## ace_burden_cat1 ACE
## 0.56558
## ace_burden_cat2 ACEs
## 0.90463
## ace_burden_cat3+ ACEs
## 1.55874
## family_structureSingle parent
## -0.12762
## family_structureOther
## 0.16265
## has_insuranceUninsured
## -0.39013
## child_nativityForeign-born
## -0.18485
## survey_year2021
## 0.05060
## survey_year2022
## 0.13594
##
## Degrees of Freedom: 89068 Total (i.e. Null); 89043 Residual
## (3267 observations deleted due to missingness)
## Null Deviance: 87090
## Residual Deviance: 78520 AIC: NA
model_rural <- svyglm(
impairing_mental_behav ~ household_language +
child_age + child_sex + child_race +
income_fpl + parent_education + ace_burden_cat +
family_structure + has_insurance +
child_nativity + survey_year,
design = subset(nsch_design_complete, urban_rural == "Rural"),
family = quasibinomial()
)
model_rural
## Stratified Independent Sampling design (with replacement)
## subset(nsch_design_complete, urban_rural == "Rural")
##
## Call: svyglm(formula = impairing_mental_behav ~ household_language +
## child_age + child_sex + child_race + income_fpl + parent_education +
## ace_burden_cat + family_structure + has_insurance + child_nativity +
## survey_year, design = subset(nsch_design_complete, urban_rural ==
## "Rural"), family = quasibinomial())
##
## Coefficients:
## (Intercept)
## -2.180922
## household_languageBilingual
## -0.863381
## household_languageNon-English
## -0.174962
## child_age
## 0.085068
## child_sexFemale
## -0.534767
## child_raceBlack or African American alone
## -0.042386
## child_raceAmerican Indian or Alaska Native alone
## 0.046324
## child_raceAsian alone
## -1.030501
## child_raceNative Hawaiian / Other Pacific Islander alone
## 0.207662
## child_raceTwo or more races
## 0.366579
## income_fpl100-199% FPL
## -0.244440
## income_fpl200-399% FPL
## -0.218806
## income_fpl400%+ FPL
## -0.473180
## parent_educationHigh school
## -0.194453
## parent_educationSome college
## -0.007770
## parent_educationBachelor's+
## -0.107434
## ace_burden_cat1 ACE
## 0.426246
## ace_burden_cat2 ACEs
## 0.972062
## ace_burden_cat3+ ACEs
## 1.441448
## family_structureSingle parent
## -0.066445
## family_structureOther
## 0.158432
## has_insuranceUninsured
## -0.512563
## child_nativityForeign-born
## -0.006018
## survey_year2021
## 0.131979
## survey_year2022
## 0.222544
##
## Degrees of Freedom: 19646 Total (i.e. Null); 19621 Residual
## (697 observations deleted due to missingness)
## Null Deviance: 20170
## Residual Deviance: 18010 AIC: NA
Urban and Rural Odds Ratios (OR)
# Urban ORs
urban_results <- tidy(model_urban) %>%
mutate(
OR = exp(estimate),
LCL = exp(estimate - 1.96 * std.error),
UCL = exp(estimate + 1.96 * std.error),
stratum = "Urban"
)
# Rural ORs
rural_results <- tidy(model_rural) %>%
mutate(
OR = exp(estimate),
LCL = exp(estimate - 1.96 * std.error),
UCL = exp(estimate + 1.96 * std.error),
stratum = "Rural"
)
# Combined
stratified_results <- bind_rows(urban_results, rural_results) %>%
select(stratum, term, OR, LCL, UCL, p.value) %>%
filter(grepl("household_language|ace_burden", term)) %>% # focus on key terms
mutate(across(c(OR, LCL, UCL), ~round(., 2)),
p.value = round(p.value, 3))
print(stratified_results, n = Inf)
## # A tibble: 10 × 6
## stratum term OR LCL UCL p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Urban household_languageBilingual 0.38 0.31 0.47 0
## 2 Urban household_languageNon-English 1 0.69 1.44 0.984
## 3 Urban ace_burden_cat1 ACE 1.76 1.6 1.93 0
## 4 Urban ace_burden_cat2 ACEs 2.47 2.19 2.79 0
## 5 Urban ace_burden_cat3+ ACEs 4.75 4.16 5.43 0
## 6 Rural household_languageBilingual 0.42 0.21 0.85 0.015
## 7 Rural household_languageNon-English 0.84 0.21 3.37 0.805
## 8 Rural ace_burden_cat1 ACE 1.53 1.25 1.88 0
## 9 Rural ace_burden_cat2 ACEs 2.64 2 3.5 0
## 10 Rural ace_burden_cat3+ ACEs 4.23 3.3 5.41 0
Household Language Effects by Stratum Bilingual vs. English-only:
Urban: OR = 0.38 (95% CI: 0.31–0.47), p < 0.001; highly precise,
very strong protective effect Rural: OR = 0.42 (95% CI: 0.21–0.85), p =
0.015; same direction, significant, but much wider CI reflecting the
small rural bilingual sample
The point estimates are nearly identical (0.38 vs. 0.42) and the CIs
overlap substantially — this is exactly why the interaction test was
null. The bilingual protective effect holds in both settings.
Non-English vs. English-only:
Urban: OR = 1.00 (95% CI: 0.69–1.44), p = 0.984; completely null
Rural: OR = 0.84 (95% CI: 0.21–3.37), p = 0.805;null, and the CI spans
0.21 to 3.37, which is essentially uninformative
The rural Non-English CI width (3.16 units wide) signals serious
sample size limitations for that cell. Any interpretation there should
be treated with extreme caution.
Stratified analyses revealed that the protective association of
bilingual household language was consistent across urban (OR = 0.38, 95%
CI: 0.31–0.47) and rural settings (OR = 0.42, 95% CI: 0.21–0.85), with
overlapping confidence intervals supporting the null interaction finding
(p = 0.78). The Non-English household effect was null in both strata,
though rural estimates were imprecise due to small cell sizes. ACE
burden demonstrated a consistent dose-response gradient in both urban
and rural settings, with children in households reporting 3+ ACEs facing
approximately 4–5 times the odds of impairing mental/behavioral
conditions compared to those with no ACEs.
STEP 6 — Multicollinearity Check
# Note: VIF uses unweighted glm as car::vif does not support svyglm objects.
# VIF assesses design matrix collinearity only — survey weights do not affect this.
model_check <- glm(
impairing_mental_behav ~ household_language +
child_age + child_sex + child_race +
income_fpl + parent_education +
family_structure + has_insurance +
child_nativity + urban_rural +
ace_burden_cat +
survey_year,
data = nsch_clean,
family = binomial()
)
car::vif(model_check)
## GVIF Df GVIF^(1/(2*Df))
## household_language 1.190567 2 1.044572
## child_age 1.053094 1 1.026204
## child_sex 1.001753 1 1.000876
## child_race 1.177561 5 1.016479
## income_fpl 1.475979 3 1.067038
## parent_education 1.423462 3 1.060615
## family_structure 1.567192 2 1.118872
## has_insurance 1.021808 1 1.010845
## child_nativity 1.110033 1 1.053581
## urban_rural 1.051599 1 1.025475
## ace_burden_cat 1.541621 3 1.074805
## survey_year 1.010097 2 1.002515
VIF > 5 → concern VIF > 10 → serious problem
Generalized variance inflation factors (GVIF) were examined for all
model covariates using an unweighted logistic regression on the analytic
sample. All GVIF^(1/(2·Df)) values were below 1.12, indicating no
evidence of multicollinearity.
The highest value is family_structure at 1.12, which is negligible.
Even variables you might expect to be correlated, income_fpl,
parent_education, has_insurance, and ace_burden_cat, show virtually no
inflation, confirming they are each contributing independent information
to the model.
What We Do Next
You now:
Run crude model
Run adjusted model
Post the OR table
Then I will:
Help you interpret the ORs clearly
Write Section 5 in journal-ready language
Help you determine whether your immigrant paradox hypothesis is
supported
This is where your paper becomes analytical rather than
descriptive.
Part 3: Results Tables
Converting the crude and adjusted models into clean OR tables with
95% CIs and p-values
# --- Crude model table ---
tbl_crude <- tbl_regression(
model_crude,
exponentiate = TRUE,
label = list(
household_language ~ "Household language"
)
) %>%
modify_header(estimate = "**OR**") %>%
add_significance_stars(
hide_ci = FALSE,
hide_p = FALSE
)
# --- Adjusted model table ---
tbl_adj <- tbl_regression(
model_adj,
exponentiate = TRUE,
label = list(
household_language ~ "Household language",
child_age ~ "Child age (years)",
child_sex ~ "Child sex",
child_race ~ "Child race",
income_fpl ~ "Household income (% FPL)",
parent_education ~ "Primary caregiver education",
family_structure ~ "Family structure",
has_insurance ~ "Health insurance status",
child_nativity ~ "Child nativity",
urban_rural ~ "Urbanicity",
ace_burden_cat ~ "ACE burden",
survey_year ~ "Survey year"
)
) %>%
modify_header(estimate = "**aOR**") %>%
add_significance_stars(
hide_ci = FALSE,
hide_p = FALSE
)
# --- Merge crude and adjusted side by side ---
tbl_merged <- tbl_merge(
tbls = list(tbl_crude, tbl_adj),
tab_spanner = c("**Crude**", "**Adjusted**"),
quiet = TRUE
) %>%
modify_caption(
"**Table 2. Crude and Adjusted Odds Ratios for Any Impairing Mental/Behavioral Condition**"
)
# --- Render and save ---
tbl_merged %>%
as_gt() %>%
gt::tab_source_note(
source_note = gt::md(
"OR = odds ratio; aOR = adjusted odds ratio; CI = 95% confidence interval. Models estimated using survey-weighted quasibinomial regression accounting for NSCH complex sampling design.
Reference categories: household language = English only; child sex = Male; child race = White alone; income = 0–99% FPL; parent education = Less than HS; family structure = Two parents; insurance = Insured; nativity = US-born; urbanicity = Urban; ACE burden = 0 ACEs; survey year = 2020.
*p<0.05; **p<0.01; ***p<0.001"
)
) %>%
gt::tab_options(
table.font.size = 12,
heading.title.font.size = 14,
data_row.padding = gt::px(4)
) %>%
gt::gtsave(file = file.path(project_folder, "outputs/tables/table2_crude_adjusted.html"))
tbl_merged
Table 2. Crude and Adjusted Odds Ratios for Any Impairing Mental/Behavioral Condition
| Characteristic |
Crude
|
Adjusted
|
| OR |
SE |
95% CI |
p-value |
aOR |
SE |
95% CI |
p-value |
| Household language |
|
|
|
|
|
|
|
|
| English only |
— |
— |
— |
|
— |
— |
— |
|
| Bilingual |
0.37*** |
0.088 |
0.31, 0.44 |
<0.001 |
0.38*** |
0.106 |
0.31, 0.47 |
<0.001 |
| Non-English |
0.63** |
0.159 |
0.46, 0.86 |
0.003 |
0.96 |
0.182 |
0.68, 1.38 |
0.8 |
| Child age (years) |
|
|
|
|
1.09*** |
0.004 |
1.09, 1.10 |
<0.001 |
| Child sex |
|
|
|
|
|
|
|
|
| Male |
|
|
|
|
— |
— |
— |
|
| Female |
|
|
|
|
0.70*** |
0.032 |
0.66, 0.75 |
<0.001 |
| Child race |
|
|
|
|
|
|
|
|
| White alone |
|
|
|
|
— |
— |
— |
|
| Black or African American alone |
|
|
|
|
0.74*** |
0.054 |
0.67, 0.83 |
<0.001 |
| American Indian or Alaska Native alone |
|
|
|
|
0.71** |
0.124 |
0.56, 0.90 |
0.006 |
| Asian alone |
|
|
|
|
0.46*** |
0.092 |
0.39, 0.55 |
<0.001 |
| Native Hawaiian / Other Pacific Islander alone |
|
|
|
|
0.64 |
0.264 |
0.38, 1.08 |
0.094 |
| Two or more races |
|
|
|
|
1.00 |
0.057 |
0.90, 1.12 |
>0.9 |
| Household income (% FPL) |
|
|
|
|
|
|
|
|
| 0-99% FPL |
|
|
|
|
— |
— |
— |
|
| 100-199% FPL |
|
|
|
|
0.93 |
0.058 |
0.83, 1.04 |
0.2 |
| 200-399% FPL |
|
|
|
|
0.81*** |
0.055 |
0.72, 0.90 |
<0.001 |
| 400%+ FPL |
|
|
|
|
0.81*** |
0.057 |
0.72, 0.90 |
<0.001 |
| Primary caregiver education |
|
|
|
|
|
|
|
|
| Less than HS |
|
|
|
|
— |
— |
— |
|
| High school |
|
|
|
|
1.06 |
0.093 |
0.89, 1.27 |
0.5 |
| Some college |
|
|
|
|
1.16 |
0.092 |
0.97, 1.39 |
0.10 |
| Bachelor's+ |
|
|
|
|
1.15 |
0.092 |
0.96, 1.38 |
0.13 |
| Family structure |
|
|
|
|
|
|
|
|
| Two parents |
|
|
|
|
— |
— |
— |
|
| Single parent |
|
|
|
|
0.89** |
0.045 |
0.81, 0.97 |
0.009 |
| Other |
|
|
|
|
1.17* |
0.077 |
1.01, 1.37 |
0.039 |
| Health insurance status |
|
|
|
|
|
|
|
|
| Insured |
|
|
|
|
— |
— |
— |
|
| Uninsured |
|
|
|
|
0.67*** |
0.087 |
0.56, 0.79 |
<0.001 |
| Child nativity |
|
|
|
|
|
|
|
|
| US-born |
|
|
|
|
— |
— |
— |
|
| Foreign-born |
|
|
|
|
0.84 |
0.108 |
0.68, 1.03 |
0.10 |
| Urbanicity |
|
|
|
|
|
|
|
|
| Urban |
|
|
|
|
— |
— |
— |
|
| Rural |
|
|
|
|
0.90** |
0.042 |
0.82, 0.97 |
0.010 |
| ACE burden |
|
|
|
|
|
|
|
|
| 0 ACEs |
|
|
|
|
— |
— |
— |
|
| 1 ACE |
|
|
|
|
1.73*** |
0.044 |
1.59, 1.88 |
<0.001 |
| 2 ACEs |
|
|
|
|
2.49*** |
0.057 |
2.23, 2.79 |
<0.001 |
| 3+ ACEs |
|
|
|
|
4.66*** |
0.060 |
4.14, 5.24 |
<0.001 |
| Survey year |
|
|
|
|
|
|
|
|
| 2020 |
|
|
|
|
— |
— |
— |
|
| 2021 |
|
|
|
|
1.06 |
0.042 |
0.98, 1.15 |
0.2 |
| 2022 |
|
|
|
|
1.16*** |
0.040 |
1.07, 1.25 |
<0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error |
Primary Exposure: Household Language The crude-to-adjusted comparison
tells an important story:
Bilingual vs. English-only: OR barely changes from crude (0.37) to
adjusted (0.38), the protective effect is robust and not explained by
any of the covariates. Bilingual households have 62% lower odds of the
child having an impairing condition. Non-English vs. English-only: OR
shifts dramatically from 0.63 (crude, significant) to 0.96 (adjusted, p
= 0.80), the apparent crude protection completely disappears after
adjustment. This tells you the crude association was entirely
confounded, most likely by the younger age profile, lower income, and
lower ACE burden of Non-English households.
ACE Burden — Strongest Effect in the Model The dose-response gradient
is striking and all highly significant:
This is the largest effect in the entire model, larger even than the
language effect, and follows a textbook dose-response patter
Child Characteristics
Age: Each additional year increases odds by 9% (aOR = 1.09), older
children are more likely to have a diagnosed condition, reflecting
cumulative detection over time Sex: Girls have 30% lower odds than boys
(aOR = 0.70), consistent with the well-established male predominance in
neurodevelopmental and externalizing conditions Race: Black (aOR =
0.74), AIAN (aOR = 0.71), and Asian (aOR = 0.46) children all have
significantly lower odds than White children, likely reflecting
diagnostic and access disparities rather than true lower prevalence
Socioeconomic Factors
Income: Only the higher FPL categories (200–399% and 400%+) are
significant, both with aOR = 0.81; moderate and high income confers
modest protection vs. poverty Parent education: No significant effects
after adjustment, education’s influence is likely mediated through
income and ACE burden Insurance: Uninsured children have 33% lower odds
(aOR = 0.67), almost certainly reflects underdiagnosis rather than true
protection Family structure: Single parent households have slightly
lower odds (aOR = 0.89), counterintuitive, possibly reflecting similar
underdiagnosis dynamics Rural: Lower odds than urban (aOR = 0.90), again
likely a detection/access artifact
Survey Year
2022 shows modestly higher odds than 2020 (aOR = 1.16), could reflect
true increases in child mental health conditions post-pandemic, or
improved detection/reporting over time
After full adjustment, bilingual household language remained strongly
associated with lower odds of impairing mental/behavioral conditions
(aOR = 0.38, 95% CI: 0.31–0.47), while the crude association for
Non-English households was entirely attenuated (aOR = 0.96, 95% CI:
0.68–1.38), suggesting confounding by demographic factors. ACE burden
demonstrated the strongest independent association in the model, with
children in households reporting 3 or more ACEs facing over four times
the odds of impairing conditions compared to those with no ACEs (aOR =
4.66, 95% CI: 4.14–5.24).