Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
library(labelled)
library(scales)
library(survey)
library(srvyr)
library(gt)
library(patchwork)
library(scales)
library(labelled)
library(DiagrammeR)
library(DiagrammeRsvg)
library(rsvg)
library(tibble)
library(ggplot2)
library(flextable)
library(officer)
library(flextable)


options(survey.lonely.psu = "adjust")
options(scipen = 999)
set.seed(553)

Section 1: Data Loading and Analytical Sample

Seting working directory and path

project_folder <- "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/NSCH Project Files"
data_folder    <- file.path(project_folder, "data")

dir.create(file.path(project_folder, "outputs", "tables"),  showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(project_folder, "outputs", "figures"), showWarnings = FALSE, recursive = TRUE)

setwd(project_folder)
cat("Working directory:", getwd(), "\n")

## Working directory: C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/NSCH Project Files

Loading and stacking NSCH survey data (2020–2022)

data_folder <- "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/NSCH Project Files/data"

nsch_2020 <- read_sas(file.path(data_folder, "nsch_2020_topical_SAS/nsch_2020e_topical.sas7bdat")) %>%
  filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17) %>%
  mutate(YEAR = 2020)

nsch_2021 <- read_sas(file.path(data_folder, "nsch_2021_topical_SAS/nsch_2021e_topical.sas7bdat")) %>%
  filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17) %>%
  mutate(YEAR = 2021)

nsch_2022 <- read_sas(file.path(data_folder, "nsch_2022_topical_SAS/nsch_2022e_topical.sas7bdat")) %>%
  filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17) %>%
  mutate(YEAR = 2022)

# Stack years into one pooled dataset
nsch_combined <- bind_rows(nsch_2020, nsch_2021, nsch_2022)

cat("2020 N =", nrow(nsch_2020), "\n")

## 2020 N = 39738

cat("2021 N =", nrow(nsch_2021), "\n")

## 2021 N = 46488

cat("2022 N =", nrow(nsch_2022), "\n")

## 2022 N = 49659

cat("Combined N =", format(nrow(nsch_combined), big.mark = ","), "\n")

## Combined N = 135,885

rm(nsch_2020, nsch_2021, nsch_2022)
gc()

##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  2828167 151.1    4640658  247.9   4640658  247.9
## Vcells 74794352 570.7  171931001 1311.8 144789963 1104.7

Starting sample size in the raw dataset (the combined raw N).

Analytical Sample (exclusion)

# Start: pooled raw dataset
n_start <- nrow(nsch_combined)

# Exclusion 1: Age restriction (2–17 years)
nsch_step1 <- nsch_combined %>%
  filter(SC_AGE_YEARS >= 2, SC_AGE_YEARS <= 17)
n_step1 <- nrow(nsch_step1)

# Exclusion 2: Missing primary predictor (household language / English proficiency)
# Require HHLANGUAGE to be present.
# For non-English households (2 or 3), require SC_ENGLISH to classify language category.
nsch_step2 <- nsch_step1 %>%
  filter(!is.na(HHLANGUAGE)) %>%
  filter(!(HHLANGUAGE %in% c(2, 3) & is.na(SC_ENGLISH)))
n_step2 <- nrow(nsch_step2)

# Exclusion 3: Missing or zero survey weight
nsch_step3 <- nsch_step2 %>%
  filter(!is.na(FWC), FWC > 0)
n_step3 <- nrow(nsch_step3)

# Final analytic dataset
nsch_analysis <- nsch_step3
n_final <- nrow(nsch_analysis)

# Exclusion table
exclusion_table <- tibble(
  Step = c(
    "1. Raw combined sample (2020–2022)",
    "2. Exclude: age <2 or >17 years",
    "3. Exclude: missing household language or English proficiency",
    "4. Exclude: missing or zero survey weight (FWC)",
    "5. Final analytic sample"
  ),
  `N Remaining` = c(n_start, n_step1, n_step2, n_step3, n_final),
  `N Dropped`   = c(NA,
                    n_start - n_step1,
                    n_step1 - n_step2,
                    n_step2 - n_step3,
                    NA)
)

exclusion_table %>%
  kable(
    caption     = "Table S1. Analytical Sample Construction — Exclusion Flowchart",
    col.names   = c("Step", "N Remaining", "N Dropped"),
    format.args = list(big.mark = ","),
    na          = "—"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  row_spec(5, bold = TRUE, background = "#d4edda")

Table S1. Analytical Sample Construction — Exclusion Flowchart
Step	N Remaining	N Dropped
Raw combined sample (2020–2022)	135,885	NA
Exclude: age <2 or >17 years	135,885	0
Exclude: missing household language or English proficiency	133,052	2,833
Exclude: missing or zero survey weight (FWC)	133,052	0
Final analytic sample	133,052	NA

Save Table_Flowchat

# ── SAVE OUTPUT ───────────────────────────────────────────────────

# 1) Save as HTML
exclusion_table %>%
  kable(
    caption     = "Table S1. Analytical Sample Construction — Exclusion Flowchart",
    col.names   = c("Step", "N Remaining", "N Dropped"),
    format.args = list(big.mark = ","),
    na          = "—",
    format      = "html"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  row_spec(5, bold = TRUE, background = "#d4edda") %>%
  save_kable(
    file = file.path(project_folder, "outputs/tables/TableS1_Exclusion_Flowchart.html")
  )

# 2) Save as Word document
tableS1_ft <- exclusion_table %>%
  mutate(
    `N Remaining` = format(`N Remaining`, big.mark = ","),
    `N Dropped`   = ifelse(is.na(`N Dropped`), "—",
                           format(`N Dropped`, big.mark = ","))
  ) %>%
  flextable::flextable() %>%
  flextable::set_header_labels(
    Step         = "Step",
    `N Remaining` = "N Remaining",
    `N Dropped`   = "N Dropped"
  ) %>%
  flextable::bold(i = 5, part = "body") %>%          # bold final row
  flextable::bg(i = 5, bg = "#d4edda", part = "body") %>%  # green final row
  flextable::bg(i = 1, bg = "#f8f9fa", part = "header") %>%
  flextable::font(fontname = "Times New Roman", part = "all") %>%
  flextable::fontsize(size = 11, part = "all") %>%
  flextable::autofit() %>%
  flextable::theme_booktabs() %>%
  flextable::set_caption(
    "Table S1. Analytical Sample Construction — Exclusion Flowchart"
  )

doc_s1 <- officer::read_docx() %>%
  officer::body_add_par(
    "Table S1. Analytical Sample Construction — Exclusion Flowchart",
    style = "heading 2"
  ) %>%
  officer::body_add_par(" ", style = "Normal") %>%
  flextable::body_add_flextable(tableS1_ft) %>%
  officer::body_add_par(" ", style = "Normal") %>%
  officer::body_add_par(
    "NSCH 2020–2022. Raw combined sample includes all children aged 2–17 years across three survey waves. Exclusions applied sequentially. Final analytic sample retains only children with valid household language classification and non-zero survey weight.",
    style = "Normal"
  )

print(
  doc_s1,
  target = file.path(project_folder, "outputs/tables/TableS1_Exclusion_Flowchart.docx")
)

cat("Table S1 saved as HTML and Word successfully.\n")

## Table S1 saved as HTML and Word successfully.

Starting N -> Each exclusion criterion -> N dropped at each step -> Final N

Create Consort-style flow diagram

library(DiagrammeR)

flowchart <- grViz(sprintf("
digraph flowchart {

  graph [
    layout = dot,
    rankdir = TB,
    labelloc = t,
    label = <<B>Figure 1. Analytical Sample Selection</B><BR/>
             <FONT POINT-SIZE='12'>
             Household Language and Children's Mental and Behavioral Health<BR/>
             National Survey of Children's Health (NSCH), 2020-2022
             </FONT>>,
    fontsize = 18
  ]

  node [
    shape = box,
    style = filled,
    fillcolor = '#f8f9fa',
    fontname = Helvetica,
    fontsize = 12,
    width = 4
  ]

  A [label = 'Raw Combined Sample (2020-2022)\\nN = %s']
  B [label = 'Exclude: Age < 2 years\\nRemaining N = %s']
  C [label = 'Exclude: Missing Household Language or English Proficiency\\nRemaining N = %s']
  D [label = 'Exclude: Missing or Zero Survey Weight (FWC)\\nRemaining N = %s']
  E [label = 'Final Analytical Sample for Analysis\\nN = %s',
     fillcolor = '#d4edda',
     fontsize = 13]

  A -> B
  B -> C
  C -> D
  D -> E
}
",
format(n_start, big.mark = ","),
format(n_step1, big.mark = ","),
format(n_step2, big.mark = ","),
format(n_step3, big.mark = ","),
format(n_final, big.mark = ",")
))

flowchart

svg <- export_svg(flowchart)
rsvg_png(charToRaw(svg),
         file = "outputs/figures/Figure1_Analytical_Sample_Flow.png",
         width = 1200,
         height = 1800)

Creating a Working Subset

# Restrict to analytic variables and add survey design identifiers
nsch_subset <- nsch_analysis %>%
  select(
    # Survey design
    STRATUM, HHID, FWC, YEAR,

    # Outcomes (mental & behavioral health conditions)
    K2Q31A, K2Q31B,   # ADHD
    K2Q32A, K2Q32B,   # Depression
    K2Q33A, K2Q33B,   # Anxiety
    K2Q34A, K2Q34B,   # Behavioral/Conduct problems
    K2Q35A, K2Q35B,   # Autism
    K2Q36A, K2Q36B,   # Developmental delay

    # Primary predictor
    HHLANGUAGE, SC_ENGLISH,

    # Moderators
    METRO_YN, A1_MENTHEALTH, ACE1, ACE7,

    # Covariates
    SC_AGE_YEARS, SC_SEX, SC_RACE_R,
    FPL_I1, A1_GRADE, FAMILY_R,
    CURRINS, BORNUSA
  )

cat("Final analytic subset dimensions:",
    nrow(nsch_subset), "observations,",
    ncol(nsch_subset), "variables\n")

## Final analytic subset dimensions: 133052 observations, 30 variables

Section 2: Variable Selection, Recoding, and Data Cleaning

Outcome Variable

nsch_clean <- nsch_analysis %>%
  mutate(

    # Each condition requires BOTH diagnosis (A) AND current impairment (B)
    # NA in either source variable is treated as condition not present (0)
    has_adhd = case_when(
      K2Q31A == 1 & K2Q31B == 1 ~ 1,
      is.na(K2Q31A) | is.na(K2Q31B) ~ 0,
      TRUE ~ 0
    ),
    has_depression = case_when(
      K2Q32A == 1 & K2Q32B == 1 ~ 1,
      is.na(K2Q32A) | is.na(K2Q32B) ~ 0,
      TRUE ~ 0
    ),
    has_anxiety = case_when(
      K2Q33A == 1 & K2Q33B == 1 ~ 1,
      is.na(K2Q33A) | is.na(K2Q33B) ~ 0,
      TRUE ~ 0
    ),
    has_behavioral = case_when(
      K2Q34A == 1 & K2Q34B == 1 ~ 1,
      is.na(K2Q34A) | is.na(K2Q34B) ~ 0,
      TRUE ~ 0
    ),
    has_autism = case_when(
      K2Q35A == 1 & K2Q35B == 1 ~ 1,
      is.na(K2Q35A) | is.na(K2Q35B) ~ 0,
      TRUE ~ 0
    ),
    has_devdelay = case_when(
      K2Q36A == 1 & K2Q36B == 1 ~ 1,
      is.na(K2Q36A) | is.na(K2Q36B) ~ 0,
      TRUE ~ 0
    ),

    # Composite primary outcome
    impairing_mental_behav = ifelse(
      has_anxiety == 1 | has_depression == 1 | has_adhd == 1 |
      has_behavioral == 1 | has_autism == 1 | has_devdelay == 1, 1, 0),

    # Secondary outcomes by condition subtype
    internalizing      = ifelse(has_anxiety == 1 | has_depression == 1, 1, 0),
    externalizing      = ifelse(has_adhd == 1 | has_behavioral == 1, 1, 0),
    neurodevelopmental = ifelse(has_autism == 1 | has_devdelay == 1, 1, 0)
  )

# Verify zero NAs on all outcome variables
cat("NA check on outcome variables:\n")

## NA check on outcome variables:

nsch_clean %>%
  summarise(across(
    c(has_adhd, has_depression, has_anxiety, has_behavioral,
      has_autism, has_devdelay, impairing_mental_behav,
      internalizing, externalizing, neurodevelopmental),
    ~sum(is.na(.))
  )) %>%
  pivot_longer(everything(),
               names_to  = "variable",
               values_to = "n_missing") %>%
  print()

## # A tibble: 10 × 2
##    variable               n_missing
##    <chr>                      <int>
##  1 has_adhd                       0
##  2 has_depression                 0
##  3 has_anxiety                    0
##  4 has_behavioral                 0
##  5 has_autism                     0
##  6 has_devdelay                   0
##  7 impairing_mental_behav         0
##  8 internalizing                  0
##  9 externalizing                  0
## 10 neurodevelopmental             0

# Distribution of primary outcome
nsch_clean %>%
  count(impairing_mental_behav) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  kable(
    caption   = "Distribution of Primary Outcome: Any Impairing Mental/Behavioral Condition",
    col.names = c("Value", "N", "%"),
    format.args = list(big.mark = ",")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribution of Primary Outcome: Any Impairing Mental/Behavioral Condition
Value	N	%
0	104,458	78.5
1	28,594	21.5

Prints outcome variable as binary (0/1)

Primary Predictor: Household Language

# Household language (3-category factor)
# Derived from HHLANGUAGE (household language) and SC_ENGLISH
# (child English proficiency, asked only if HHLANGUAGE != 1)

nsch_clean <- nsch_clean %>%
  mutate(
    household_language = factor(case_when(
      HHLANGUAGE == 1                                     ~ "English only",
      HHLANGUAGE %in% c(2,3) & SC_ENGLISH %in% c(1, 2)  ~ "Bilingual",
      HHLANGUAGE %in% c(2,3) & SC_ENGLISH %in% c(3, 4)  ~ "Non-English"
    ), levels = c("English only", "Bilingual", "Non-English"))
  )

# Distribution of predictor
nsch_clean %>%
  count(household_language) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  kable(
    caption   = "Distribution of Primary Predictor: Household Language",
    col.names = c("Group", "N", "%"),
    format.args = list(big.mark = ",")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribution of Primary Predictor: Household Language
Group	N	%
English only	124,361	93.5
Bilingual	7,831	5.9
Non-English	860	0.6

Covariates

nsch_clean <- nsch_clean %>%
  mutate(
    # Child age (continuous, years)
    child_age = SC_AGE_YEARS,

    # Age group (categorical)
    age_group = factor(
      case_when(
        SC_AGE_YEARS >= 2  & SC_AGE_YEARS <= 5  ~ "2-5 years",
        SC_AGE_YEARS >= 6  & SC_AGE_YEARS <= 11 ~ "6-11 years",
        SC_AGE_YEARS >= 12 & SC_AGE_YEARS <= 17 ~ "12-17 years",
        TRUE ~ NA_character_
      ),
      levels = c("2-5 years", "6-11 years", "12-17 years")
    ),

    # Child sex (binary)
    child_sex = factor(
      case_when(
        SC_SEX == 1 ~ "Male",
        SC_SEX == 2 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    ),

    # Child race (SC_RACE_R: detailed race categories)
    child_race = factor(
      case_when(
        SC_RACE_R == 1 ~ "White alone",
        SC_RACE_R == 2 ~ "Black or African American alone",
        SC_RACE_R == 3 ~ "American Indian or Alaska Native alone",
        SC_RACE_R == 4 ~ "Asian alone",
        SC_RACE_R == 5 ~ "Native Hawaiian / Other Pacific Islander alone",
        SC_RACE_R == 7 ~ "Two or more races",
        TRUE ~ NA_character_
      ),
      levels = c(
        "White alone",
        "Black or African American alone",
        "American Indian or Alaska Native alone",
        "Asian alone",
        "Native Hawaiian / Other Pacific Islander alone",
        "Two or more races"
      )
    ),

    # Household income as % Federal Poverty Level (4-category)
    income_fpl = factor(
      case_when(
        FPL_I1 < 100                   ~ "0-99% FPL",
        FPL_I1 >= 100 & FPL_I1 < 200  ~ "100-199% FPL",
        FPL_I1 >= 200 & FPL_I1 < 400  ~ "200-399% FPL",
        FPL_I1 >= 400                  ~ "400%+ FPL",
        TRUE ~ NA_character_
      ),
      levels = c("0-99% FPL", "100-199% FPL", "200-399% FPL", "400%+ FPL")
    ),

    # Parent education (4-category)
    parent_education = factor(
      case_when(
        A1_GRADE %in% c(1, 2)     ~ "Less than HS",
        A1_GRADE %in% c(3, 4)     ~ "High school",
        A1_GRADE %in% c(5, 6)     ~ "Some college",
        A1_GRADE %in% c(7, 8, 9)  ~ "Bachelor's+",
        TRUE ~ NA_character_
      ),
      levels = c("Less than HS", "High school", "Some college", "Bachelor's+")
    ),

    # Family structure (3-category)
    family_structure = factor(
      case_when(
        FAMILY_R %in% c(1, 2, 3) ~ "Two parents",
        FAMILY_R %in% c(4, 5, 6) ~ "Single parent",
        FAMILY_R %in% c(7, 8)    ~ "Other",
        TRUE ~ NA_character_
      ),
      levels = c("Two parents", "Single parent", "Other")
    ),

    # Health insurance (binary)
    has_insurance = factor(
      case_when(
        CURRINS == 1 ~ "Insured",
        CURRINS == 2 ~ "Uninsured",
        TRUE ~ NA_character_
      ),
      levels = c("Insured", "Uninsured")
    ),

    # Child nativity (binary)
    child_nativity = factor(
      case_when(
        BORNUSA == 1 ~ "US-born",
        BORNUSA == 2 ~ "Foreign-born",
        TRUE ~ NA_character_
      ),
      levels = c("US-born", "Foreign-born")
    ),

    # Urbanicity (binary)
    urban_rural = factor(
      case_when(
        METRO_YN == 1 ~ "Urban",
        METRO_YN == 2 ~ "Rural",
        TRUE ~ NA_character_
      ),
      levels = c("Urban", "Rural")
    ),

    # Caregiver mental health (moderator)
    caregiver_mental_health = factor(
      case_when(
        A1_MENTHEALTH %in% c(1, 2, 3) ~ "Good (excellent/very good/good)",
        A1_MENTHEALTH %in% c(4, 5)    ~ "Poor (fair/poor)",
        TRUE ~ NA_character_
      ),
      levels = c("Good (excellent/very good/good)", "Poor (fair/poor)")
    ),

    # --- ACE individual items (binary 0/1) ---

    # ACE1: Hard to cover basics (4-point scale → binary)
    # Somewhat often (3) or Very often (4) = exposed
    # AC1 Economic hardship was originally a 4-point scale but it was 
    # dichotomized to exposed (1) and unexposed (0)
    ace1_bin = case_when(
      ACE1 %in% c(3, 4) ~ 1L,
      ACE1 %in% c(1, 2) ~ 0L,
      TRUE ~ NA_integer_
    ),

    # ACE3–ACE10: Binary Yes(1)/No(2) items
    ace3_bin  = case_when(ACE3  == 1 ~ 1L, ACE3  == 2 ~ 0L, TRUE ~ NA_integer_),  # Parent divorced/separated
    ace4_bin  = case_when(ACE4  == 1 ~ 1L, ACE4  == 2 ~ 0L, TRUE ~ NA_integer_),  # Parent died
    ace5_bin  = case_when(ACE5  == 1 ~ 1L, ACE5  == 2 ~ 0L, TRUE ~ NA_integer_),  # Parent jailed
    ace6_bin  = case_when(ACE6  == 1 ~ 1L, ACE6  == 2 ~ 0L, TRUE ~ NA_integer_),  # Domestic violence
    ace7_bin  = case_when(ACE7  == 1 ~ 1L, ACE7  == 2 ~ 0L, TRUE ~ NA_integer_),  # Victim/witness of violence
    ace8_bin  = case_when(ACE8  == 1 ~ 1L, ACE8  == 2 ~ 0L, TRUE ~ NA_integer_),  # Lived w/ mentally ill
    ace9_bin  = case_when(ACE9  == 1 ~ 1L, ACE9  == 2 ~ 0L, TRUE ~ NA_integer_),  # Lived w/ substance user
    ace10_bin = case_when(ACE10 == 1 ~ 1L, ACE10 == 2 ~ 0L, TRUE ~ NA_integer_),  # Racial discrimination

    # ACE burden: continuous count score (0-9)
    # na.rm = TRUE allows partial scoring across years where not all items were asked
    ace_burden = rowSums(
      cbind(ace1_bin, ace3_bin, ace4_bin, ace5_bin, ace6_bin,
            ace7_bin, ace8_bin, ace9_bin, ace10_bin),
      na.rm = TRUE
    ),

    # ACE burden: categorical (for descriptive table)
    ace_burden_cat = factor(
      case_when(
        ace_burden == 0 ~ "0 ACEs",
        ace_burden == 1 ~ "1 ACE",
        ace_burden == 2 ~ "2 ACEs",
        ace_burden >= 3 ~ "3+ ACEs",
        TRUE ~ NA_character_
      ),
      levels = c("0 ACEs", "1 ACE", "2 ACEs", "3+ ACEs")
    ),

    # Survey year
    survey_year = factor(YEAR),

    # Survey design variables
    survey_weight = FWC,
    STRATUM_F     = factor(STRATUM),
    HHID_F        = factor(HHID)
  )


# --- Continuous summary (age + ACE burden) ---
covariate_summary <- nsch_clean %>%
  summarise(
    `Mean age (SD)` = paste0(
      round(mean(child_age,   na.rm = TRUE), 1),
      " (", round(sd(child_age,   na.rm = TRUE), 1), ")"
    ),
    `Mean ACE burden (SD)` = paste0(
      round(mean(ace_burden, na.rm = TRUE), 2),
      " (", round(sd(ace_burden, na.rm = TRUE), 2), ")"
    )
  )

cat("Covariate summary computed.\n")

## Covariate summary computed.

# --- Categorical covariate table (N and %) ---
cov_vars <- c(
  "age_group", "child_sex", "child_race",
  "income_fpl", "caregiver_mental_health", "parent_education", "family_structure",
  "has_insurance", "child_nativity", "urban_rural", "ace_burden_cat", "survey_year"
)

covariate_table <- map_dfr(cov_vars, function(v) {
  nsch_clean %>%
    count(.data[[v]]) %>%
    mutate(
      variable = v,
      pct      = round(100 * n / sum(n), 1),
      level    = as.character(.data[[v]])
    ) %>%
    select(variable, level, n, pct)
})

covariate_table %>%
  kable(
    caption     = "Distribution of All Covariates (Unweighted)",
    col.names   = c("Variable", "Level", "N", "%"),
    format.args = list(big.mark = ",")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  collapse_rows(columns = 1, valign = "top")

Distribution of All Covariates (Unweighted)
Variable	Level	N	%
age_group	2-5 years	38,692	29.1
	6-11 years	42,123	31.7
	12-17 years	52,237	39.3
child_sex	Male	68,841	51.7
child_sex	Female	64,211	48.3
child_race	White alone	102,263	76.9
	Black or African American alone	9,601	7.2
	American Indian or Alaska Native alone	1,294	1.0
	Asian alone	7,601	5.7
	Native Hawaiian / Other Pacific Islander alone	736	0.6
	Two or more races	11,557	8.7
income_fpl	0-99% FPL	16,461	12.4
	100-199% FPL	21,522	16.2
	200-399% FPL	39,880	30.0
	400%+ FPL	55,189	41.5
caregiver_mental_health	Good (excellent/very good/good)	120,056	90.2
	Poor (fair/poor)	8,583	6.5
	NA	4,413	3.3
parent_education	Less than HS	4,510	3.4
	High school	21,574	16.2
	Some college	31,783	23.9
	Bachelor’s+	75,185	56.5
family_structure	Two parents	95,752	72.0
	Single parent	28,776	21.6
	Other	4,914	3.7
	NA	3,610	2.7
has_insurance	Insured	126,663	95.2
	Uninsured	5,795	4.4
	NA	594	0.4
child_nativity	US-born	127,409	95.8
	Foreign-born	4,370	3.3
	NA	1,273	1.0
urban_rural	Urban	92,336	69.4
	Rural	20,344	15.3
	NA	20,372	15.3
ace_burden_cat	0 ACEs	84,399	63.4
	1 ACE	26,405	19.8
	2 ACEs	10,475	7.9
	3+ ACEs	11,773	8.8
survey_year	2020	39,090	29.4
	2021	45,447	34.2
	2022	48,515	36.5

Missing Data Assessment

updated missing

# Assess missingness across all analytic variables
analytic_vars <- c(
  "impairing_mental_behav", "internalizing", "externalizing",
  "neurodevelopmental", "household_language",
  "child_age", "child_sex", "age_group", "child_race",
  "income_fpl", "caregiver_mental_health", "parent_education",
  "family_structure", "has_insurance", "child_nativity",
  "urban_rural", "survey_year",
  "ace_burden", "ace_burden_cat"   
)

missing_summary <- nsch_clean %>%
  select(all_of(analytic_vars)) %>%
  summarise(across(
    everything(),
    list(
      n_miss   = ~sum(is.na(.)),
      pct_miss = ~round(100 * mean(is.na(.)), 2)
    )
  )) %>%
  pivot_longer(
    everything(),
    names_to  = c("variable", "stat"),
    names_sep = "_(?=n_miss|pct_miss)"
  ) %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  arrange(desc(pct_miss))

missing_summary %>%
  kable(
    caption   = "Missing Data Summary — All Analytic Variables",
    col.names = c("Variable", "N Missing", "% Missing"),
    format.args = list(big.mark = ",")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) %>%
  row_spec(
    which(missing_summary$pct_miss >= 10),
    background = "#fff3cd"
  ) %>%
  row_spec(
    which(missing_summary$pct_miss == 0),
    background = "#d4edda"
  )

Missing Data Summary — All Analytic Variables
Variable	N Missing	% Missing
urban_rural	20,372	15.31
caregiver_mental_health	4,413	3.32
family_structure	3,610	2.71
child_nativity	1,273	0.96
has_insurance	594	0.45
impairing_mental_behav	0	0.00
internalizing	0	0.00
externalizing	0	0.00
neurodevelopmental	0	0.00
household_language	0	0.00
child_age	0	0.00
child_sex	0	0.00
age_group	0	0.00
child_race	0	0.00
income_fpl	0	0.00
parent_education	0	0.00
survey_year	0	0.00
ace_burden	0	0.00
ace_burden_cat	0	0.00

cat("Variables with >10% missing:\n")

## Variables with >10% missing:

missing_summary %>% filter(pct_miss >= 10) %>% print()

## # A tibble: 1 × 3
##   variable    n_miss pct_miss
##   <chr>        <dbl>    <dbl>
## 1 urban_rural  20372     15.3

cat("\nVariables with 0% missing:\n")

## 
## Variables with 0% missing:

missing_summary %>% filter(pct_miss == 0) %>% print()

## # A tibble: 14 × 3
##    variable               n_miss pct_miss
##    <chr>                   <dbl>    <dbl>
##  1 impairing_mental_behav      0        0
##  2 internalizing               0        0
##  3 externalizing               0        0
##  4 neurodevelopmental          0        0
##  5 household_language          0        0
##  6 child_age                   0        0
##  7 child_sex                   0        0
##  8 age_group                   0        0
##  9 child_race                  0        0
## 10 income_fpl                  0        0
## 11 parent_education            0        0
## 12 survey_year                 0        0
## 13 ace_burden                  0        0
## 14 ace_burden_cat              0        0

Section 3: Descriptive Statistics (Table 1)

Table 1

# 1) Pooled weights (3 years) 
nsch_clean <- nsch_clean %>%
  dplyr::mutate(weight_pooled = survey_weight / 3)

stopifnot("weight_pooled" %in% names(nsch_clean))

# 2) Survey design object 
nsch_design <- survey::svydesign(
  ids     = ~HHID_F,
  strata  = ~STRATUM_F,
  weights = ~weight_pooled,
  data    = nsch_clean,
  nest    = TRUE
)

# 3) Custom p-value function (continuous vars; 3+ groups) 
svy_wald_p <- function(data, variable, by, ...) {
  fml <- stats::as.formula(paste(variable, "~", by))
  fit <- survey::svyglm(fml, design = data)
  p   <- survey::regTermTest(fit, by)$p
  tibble::tibble(p.value = p)
}

# 4) Build Table 1 (weighted) 
table1 <- gtsummary::tbl_svysummary(
  nsch_design,
  by = household_language,
  include = c(
    impairing_mental_behav, internalizing, externalizing, neurodevelopmental,
    child_age, child_sex, age_group, child_race,
    income_fpl, caregiver_mental_health, parent_education, family_structure,
    has_insurance, child_nativity, urban_rural, ace_burden, ace_burden_cat, survey_year
  ),
  statistic = list(
    gtsummary::all_continuous()  ~ "{mean} ({sd})",
    gtsummary::all_categorical() ~ "{n} ({p}%)"
  ),
  digits = list(
    gtsummary::all_continuous()  ~ 1,
    gtsummary::all_categorical() ~ 1
  ),
  label = list(
    impairing_mental_behav  ~ "Any impairing mental/behavioral condition",
    internalizing           ~ "  Internalizing (anxiety or depression)",
    externalizing           ~ "  Externalizing (ADHD or behavioral problems)",
    neurodevelopmental      ~ "  Neurodevelopmental (autism or developmental delay)",
    child_age               ~ "Child age (years)",
    child_sex               ~ "Child sex",
    age_group               ~ "Age group",
    child_race              ~ "Child race",
    income_fpl              ~ "Household income (% Federal Poverty Level)",
    caregiver_mental_health ~ "Caregiver mental health",
    parent_education        ~ "Primary caregiver education",
    family_structure        ~ "Family structure",
    has_insurance           ~ "Health insurance status",
    child_nativity          ~ "Child nativity",
    urban_rural             ~ "Urbanicity",
    ace_burden              ~ "ACE burden score (continuous, 0-9)",
    ace_burden_cat          ~ "ACE burden category",
    survey_year             ~ "Survey year"
  ),
  missing = "no"
) %>%
  gtsummary::add_overall(last = FALSE) %>%
  gtsummary::add_p(
    test = list(
      gtsummary::all_continuous()  ~ "svy_wald_p",
      gtsummary::all_categorical() ~ "svy.chisq.test"
    ),
    pvalue_fun = gtsummary::label_style_pvalue(digits = 3)
  ) %>%
  gtsummary::bold_labels()          # correctly piped — not floating

# 5) Subtitle numbers (unweighted + weighted) 
unw_total <- nrow(nsch_clean)
unw_eng   <- sum(nsch_clean$household_language == "English only", na.rm = TRUE)
unw_bil   <- sum(nsch_clean$household_language == "Bilingual",    na.rm = TRUE)
unw_noeng <- sum(nsch_clean$household_language == "Non-English",  na.rm = TRUE)

wt_total <- round(sum(nsch_clean$weight_pooled, na.rm = TRUE) / 1e6, 1)
wt_eng   <- round(sum(nsch_clean$weight_pooled[nsch_clean$household_language == "English only"], na.rm = TRUE) / 1e6, 1)
wt_bil   <- round(sum(nsch_clean$weight_pooled[nsch_clean$household_language == "Bilingual"],    na.rm = TRUE) / 1e6, 1)
wt_noeng <- round(sum(nsch_clean$weight_pooled[nsch_clean$household_language == "Non-English"],  na.rm = TRUE) / 1e6, 1)

# 6) Render + save HTML 
table1_gt <- table1 %>%
  gtsummary::as_gt() %>%
  gt::tab_header(
    title    = "Table 1. Descriptive Characteristics of the Analytic Sample by Household Language",
    subtitle = paste0(
      "NSCH 2020-2022 | Unweighted N = ", format(unw_total, big.mark = ","),
      " | Weighted population (avg annual) = ", wt_total, " million",
      " | English only: n = ", format(unw_eng, big.mark = ","), " (", wt_eng, "M)",
      "; Bilingual: n = ",     format(unw_bil, big.mark = ","), " (", wt_bil, "M)",
      "; Non-English: n = ",   format(unw_noeng, big.mark = ","), " (", wt_noeng, "M)"
    )
  ) %>%
  gt::tab_source_note(
    source_note = gt::md(
      "Values are weighted mean (SD) for continuous variables and weighted n (%) for categorical variables.  
      P-values are design-adjusted: Rao-Scott chi-square tests for categorical variables and Wald tests from survey-weighted regression for continuous variables.  
      ACE burden is the count of 9 adverse childhood experience items (range 0-9); ACE1 dichotomized at 'somewhat/very often'.  
      Pooled weights were computed as annual weights / 3 for combined 2020-2022 analyses."
    )
  ) %>%
  gt::opt_row_striping() %>%
  gt::tab_options(
    table.font.size            = 12,
    heading.title.font.size    = 14,
    heading.subtitle.font.size = 11,
    data_row.padding           = gt::px(4)
  )

gt::gtsave(table1_gt,
           file = file.path(project_folder, "outputs/tables/table1_by_language.html"))
table1_gt

Characteristic	Overall N = 63,506,861¹	English only N = 55,117,665¹	Bilingual N = 7,413,253¹	Non-English N = 975,944¹	p-value²
Table 1. Descriptive Characteristics of the Analytic Sample by Household Language
NSCH 2020-2022 \| Unweighted N = 133,052 \| Weighted population (avg annual) = 63.5 million \| English only: n = 124,361 (55.1M); Bilingual: n = 7,831 (7.4M); Non-English: n = 860 (1M)
Any impairing mental/behavioral condition	12,461,074.2 (19.6%)	11,652,544.6 (21.1%)	668,150.7 (9.0%)	140,378.9 (14.4%)	<0.001
Internalizing (anxiety or depression)	6,393,064.8 (10.1%)	5,985,631.1 (10.9%)	374,493.5 (5.1%)	32,940.3 (3.4%)	<0.001
Externalizing (ADHD or behavioral problems)	7,641,799.3 (12.0%)	7,223,207.9 (13.1%)	338,236.1 (4.6%)	80,355.3 (8.2%)	<0.001
Neurodevelopmental (autism or developmental delay)	4,266,919.3 (6.7%)	3,950,479.7 (7.2%)	200,100.5 (2.7%)	116,339.1 (11.9%)	<0.001
Child age (years)	9.8 (4.5)	9.6 (4.6)	11.1 (3.8)	7.4 (3.8)	<0.001
Child sex					0.229
Male	32,385,772.1 (51.0%)	28,140,779.7 (51.1%)	3,698,042.9 (49.9%)	546,949.4 (56.0%)
Female	31,121,088.7 (49.0%)	26,976,884.8 (48.9%)	3,715,209.7 (50.1%)	428,994.2 (44.0%)
Age group					<0.001
2-5 years	14,337,576.6 (22.6%)	13,244,494.3 (24.0%)	646,547.9 (8.7%)	446,534.4 (45.8%)
6-11 years	24,017,499.4 (37.8%)	20,481,261.6 (37.2%)	3,180,905.0 (42.9%)	355,332.8 (36.4%)
12-17 years	25,151,784.9 (39.6%)	21,391,908.7 (38.8%)	3,585,799.7 (48.4%)	174,076.5 (17.8%)
Child race					<0.001
White alone	45,024,781.7 (70.9%)	38,984,982.3 (70.7%)	5,322,348.9 (71.8%)	717,450.5 (73.5%)
Black or African American alone	9,211,136.6 (14.5%)	8,778,862.8 (15.9%)	391,154.1 (5.3%)	41,119.8 (4.2%)
American Indian or Alaska Native alone	933,066.4 (1.5%)	676,950.8 (1.2%)	223,406.8 (3.0%)	32,708.8 (3.4%)
Asian alone	3,167,184.2 (5.0%)	1,983,944.5 (3.6%)	1,043,663.7 (14.1%)	139,576.1 (14.3%)
Native Hawaiian / Other Pacific Islander alone	555,660.5 (0.9%)	295,800.1 (0.5%)	228,501.5 (3.1%)	31,358.9 (3.2%)
Two or more races	4,615,031.4 (7.3%)	4,397,124.1 (8.0%)	204,177.7 (2.8%)	13,729.6 (1.4%)
Household income (% Federal Poverty Level)					<0.001
0-99% FPL	11,367,759.7 (17.9%)	8,482,758.4 (15.4%)	2,473,575.0 (33.4%)	411,426.3 (42.2%)
100-199% FPL	13,019,651.2 (20.5%)	10,335,712.7 (18.8%)	2,361,491.4 (31.9%)	322,447.1 (33.0%)
200-399% FPL	18,556,161.1 (29.2%)	16,797,650.1 (30.5%)	1,587,690.0 (21.4%)	170,820.9 (17.5%)
400%+ FPL	20,563,288.8 (32.4%)	19,501,543.3 (35.4%)	990,496.2 (13.4%)	71,249.3 (7.3%)
Caregiver mental health					<0.001
Good (excellent/very good/good)	56,596,561.7 (93.1%)	48,979,295.5 (92.7%)	6,718,058.6 (95.8%)	899,207.6 (95.0%)
Poor (fair/poor)	4,222,473.3 (6.9%)	3,882,737.9 (7.3%)	292,333.8 (4.2%)	47,401.6 (5.0%)
Primary caregiver education					<0.001
Less than HS	7,134,986.5 (11.2%)	3,540,925.1 (6.4%)	3,105,393.2 (41.9%)	488,668.2 (50.1%)
High school	13,880,949.6 (21.9%)	11,921,178.1 (21.6%)	1,736,522.2 (23.4%)	223,249.3 (22.9%)
Some college	13,102,039.4 (20.6%)	12,151,865.3 (22.0%)	868,514.7 (11.7%)	81,659.4 (8.4%)
Bachelor's+	29,388,885.3 (46.3%)	27,503,696.0 (49.9%)	1,702,822.6 (23.0%)	182,366.7 (18.7%)
Family structure					<0.001
Two parents	42,321,892.8 (69.0%)	36,797,907.3 (69.1%)	4,946,999.9 (69.6%)	576,985.6 (60.6%)
Single parent	16,016,035.2 (26.1%)	13,692,785.0 (25.7%)	2,001,731.9 (28.1%)	321,518.3 (33.8%)
Other	3,002,133.6 (4.9%)	2,785,384.9 (5.2%)	162,671.7 (2.3%)	54,077.0 (5.7%)
Health insurance status					<0.001
Insured	58,858,804.0 (93.3%)	51,846,033.7 (94.6%)	6,297,997.6 (85.8%)	714,772.7 (74.6%)
Uninsured	4,251,660.2 (6.7%)	2,967,318.7 (5.4%)	1,041,336.4 (14.2%)	243,005.0 (25.4%)
Child nativity					<0.001
US-born	59,626,942.7 (95.1%)	53,144,170.7 (97.6%)	5,894,519.3 (80.5%)	588,252.7 (61.8%)
Foreign-born	3,071,038.5 (4.9%)	1,280,048.6 (2.4%)	1,426,707.8 (19.5%)	364,282.1 (38.2%)
Urbanicity					<0.001
Urban	51,265,204.7 (87.4%)	43,888,436.8 (86.3%)	6,535,387.1 (94.7%)	841,380.8 (91.2%)
Rural	7,422,160.0 (12.6%)	6,976,375.3 (13.7%)	364,316.5 (5.3%)	81,468.1 (8.8%)
ACE burden score (continuous, 0-9)	0.8 (1.3)	0.8 (1.3)	0.5 (0.9)	0.5 (0.9)	<0.001
ACE burden category					<0.001
0 ACEs	38,157,489.9 (60.1%)	32,709,152.7 (59.3%)	4,769,265.7 (64.3%)	679,071.5 (69.6%)
1 ACE	13,960,413.6 (22.0%)	11,975,665.4 (21.7%)	1,781,320.1 (24.0%)	203,428.2 (20.8%)
2 ACEs	5,473,199.6 (8.6%)	4,891,229.0 (8.9%)	516,536.7 (7.0%)	65,433.9 (6.7%)
3+ ACEs	5,915,757.6 (9.3%)	5,541,617.5 (10.1%)	346,130.1 (4.7%)	28,010.0 (2.9%)
Survey year					0.676
2020	21,133,243.9 (33.3%)	18,379,482.8 (33.3%)	2,449,804.9 (33.0%)	303,956.2 (31.1%)
2021	21,046,813.5 (33.1%)	18,176,221.6 (33.0%)	2,504,523.9 (33.8%)	366,068.0 (37.5%)
2022	21,326,803.4 (33.6%)	18,561,960.2 (33.7%)	2,458,923.8 (33.2%)	305,919.4 (31.3%)
¹ n (%); Mean (SD)
² Pearson’s X^2: Rao & Scott adjustment
Values are weighted mean (SD) for continuous variables and weighted n (%) for categorical variables. P-values are design-adjusted: Rao-Scott chi-square tests for categorical variables and Wald tests from survey-weighted regression for continuous variables. ACE burden is the count of 9 adverse childhood experience items (range 0-9); ACE1 dichotomized at ‘somewhat/very often’. Pooled weights were computed as annual weights / 3 for combined 2020-2022 analyses.

# 7) Export Table 1 to Word 
library(officer)      # fixes: could not find function "read_docx"
library(flextable)

# Convert gtsummary table to flextable
table1_ft <- table1 %>%
  gtsummary::as_flex_table() %>%
  flextable::autofit() %>%
  flextable::font(fontname = "Times New Roman", part = "all") %>%
  flextable::fontsize(size = 10, part = "all") %>%
  flextable::theme_booktabs()

# Build Word document
doc <- officer::read_docx()

doc <- officer::body_add_par(
  doc,
  "Table 1. Descriptive Characteristics of the Analytic Sample by Household Language (NSCH 2020-2022)",
  style = "heading 2"
)

doc <- officer::body_add_par(doc, " ", style = "Normal")

doc <- flextable::body_add_flextable(doc, table1_ft)

doc <- officer::body_add_par(doc, " ", style = "Normal")

doc <- officer::body_add_par(
  doc,
  "Values are weighted mean (SD) for continuous variables and weighted n (%) for categorical variables. P-values are design-adjusted: Rao-Scott chi-square tests for categorical variables and Wald tests from survey-weighted regression for continuous variables. ACE burden is the count of 9 adverse childhood experience items (range 0-9); ACE1 dichotomized at 'somewhat/very often'. Pooled weights computed as annual weights / 3 for combined 2020-2022 analyses.",
  style = "Normal"
)

# Save
print(
  doc,
  target = file.path(project_folder, "outputs/tables/Table1_NSCH_Descriptives.docx")
)

cat("Table 1 saved to Word successfully.\n")

## Table 1 saved to Word successfully.

svymean(~is.na(caregiver_mental_health), nsch_design)

##                                         mean     SE
## is.na(caregiver_mental_health)FALSE 0.957677 0.0013
## is.na(caregiver_mental_health)TRUE  0.042323 0.0013

** Caregiver mental health was missing for approximately 4.2% of the weighted sample and will be excluded from interaction analyses using complete cases only

Section 4: Exploratory Data Analysis (EDA)

EDA Set Up

# ── EDA SETUP ────────────────────────────────────────────────────────────────
theme_eda <- theme_minimal(base_size = 13) +
  theme(
    plot.title       = element_text(face = "bold", size = 13),
    plot.subtitle    = element_text(size = 11, color = "gray40"),
    axis.title       = element_text(size = 11),
    axis.text        = element_text(size = 10),
    legend.position  = "bottom",
    panel.grid.minor = element_blank(),
    plot.caption     = element_text(size = 9, color = "gray50")
  )

lang_colors <- c(
  "English only" = "#C0392B",
  "Bilingual"    = "#8E44AD",
  "Non-English"  = "#27AE60"
)

nsch_clean <- nsch_clean %>%
  mutate(weight_pooled = survey_weight / 3)

nsch_design <- svydesign(
  ids     = ~HHID_F,
  strata  = ~STRATUM_F,
  weights = ~weight_pooled,
  data    = nsch_clean,
  nest    = TRUE
)

options(survey.lonely.psu = "adjust")
cat("EDA setup complete.\n")

## EDA setup complete.

Interpretation: This block establishes a consistent visual theme and color palette for all EDA figures, ensuring figures are publication-ready and visually coherent. The survey design object (nsch_design) correctly incorporates NSCH’s complex survey structure — stratification, clustering by household, and pooled annual weights, so that all subsequent estimates are nationally representative. Failing to account for the complex design would produce artificially narrow confidence intervals and potentially biased prevalence estimates.

Required Visualization

Figure 1a — Weighted Prevalence of the Binary Outcome

# ── Figure 1: Outcome distribution (binary) ──────────────────────────────────
# Design-based prevalence estimate
est1 <- svymean(~impairing_mental_behav, nsch_design, na.rm = TRUE)

p1 <- as.numeric(coef(est1))
se1 <- as.numeric(SE(est1))

fig1_df <- tibble::tibble(
  Outcome = c("No impairing condition", "Any impairing condition"),
  Prev    = c(1 - p1, p1),
  SE      = c(se1, se1)
) %>%
  mutate(
    low  = pmax(Prev - 1.96 * SE, 0),
    high = pmin(Prev + 1.96 * SE, 1),
    pct  = round(Prev * 100, 1)
  )

fig1a <- ggplot(fig1_df, aes(x = Outcome, y = Prev, fill = Outcome)) +
  geom_col(width = 0.60, color = "white") +
  geom_errorbar(aes(ymin = low, ymax = high), width = 0.18, linewidth = 0.8) +
  geom_text(aes(label = paste0(pct, "%")), vjust = -0.7,
            fontface = "bold", size = 5) +
  scale_y_continuous(labels = percent_format(accuracy = 1),
                     limits = c(0, 1),
                     expand = expansion(mult = c(0, 0.10))) +
  scale_fill_manual(values = c(
    "No impairing condition"  = "gray75",
    "Any impairing condition" = "#B22222"
  )) +
  labs(
    title = "Figure 1a. Weighted Distribution of Any Currently Impairing Condition",
    subtitle = "NSCH 2020–2022 (children ages 2–17), survey-weighted estimate",
    x = NULL,
    y = "Weighted proportion of children",
    caption = "Error bars = 95% CI from complex survey design using pooled weights (FWC/3)."
  ) 

fig1a

ggsave(file.path(project_folder, "outputs/figures/Figure1_Binary_Outcome.png"),
       fig1a, width = 9, height = 5.5, dpi = 300, bg = "white")

The weighted prevalence of any currently impairing mental or behavioral condition in the analytic sample is 19.6%, meaning approximately one in five children meets the case definition. The remaining 80.4% of children do not have an active impairing condition at the time of the survey. Because the outcome is binary, it does not follow a normal distribution and cannot be evaluated for skewness in the same way as a continuous variable. This confirms that logistic regression, rather than linear regression, is the appropriate modeling strategy for subsequent analyses.

The outcome is binary, so its distribution is best summarized as a weighted prevalence rather than a histogram. Approximately one in five U.S. children (about 19–20%) meets the study’s case definition of a currently impairing condition, while about four in five do not. This confirms that logistic regression is appropriate for modeling the outcome in later sections. The relatively common prevalence also implies odds ratios may differ from risk ratios, so interpretation should be careful.

Figure 1b — Histogram of Concurrent Condition Count (0–6)

# Build additive condition count variable (0–6)
nsch_clean <- nsch_clean %>%
  mutate(condition_count = has_adhd + has_depression + has_anxiety +
                           has_behavioral + has_autism + has_devdelay)

count_df <- nsch_clean %>%
  count(condition_count) %>%
  mutate(pct = 100 * n / sum(n))

fig1b <- ggplot(count_df, aes(x = factor(condition_count), y = pct,
                               fill = factor(condition_count))) +
  geom_col(color = "white", show.legend = FALSE) +
  geom_text(aes(label = paste0(round(pct, 1), "%")),
            vjust = -0.5, size = 3.8) +
  scale_fill_manual(values = c("#BDC3C7", "#F1948A", "#E74C3C", "#C0392B",
                                "#922B21", "#6E1C1C", "#4A0E0E")) +
  scale_y_continuous(labels = label_number(suffix = "%"),
                     expand = expansion(mult = c(0, 0.1))) +
  labs(
    title    = "Figure 1b. Number of Concurrent Impairing Conditions per Child (Unweighted)",
    subtitle = "Distribution is strongly right-skewed; most children have 0 or 1 condition",
    x        = "Number of Impairing Conditions (0–6)",
    y        = "Percent of Children",
    caption  = "Conditions: ADHD, depression, anxiety, behavioral problems, autism, developmental delay."
  ) +
  theme_eda

fig1b

ggsave("outputs/figures/Figure2b_Condition_Count_Histogram.png",
       fig1b, width = 8, height = 5, dpi = 300)

Interpretation: The distribution of concurrent impairing conditions is strongly right-skewed: the overwhelming majority of children have zero conditions, a meaningful minority have exactly one, and very few children carry two or more simultaneous diagnoses. This floor effect (mass at zero) would severely violate the normality assumption if the count were modeled as a continuous outcome, requiring at minimum a zero-inflated Poisson or negative-binomial approach. The strong skew confirms that collapsing the count into a binary “any vs. none” composite is both statistically appropriate and clinically meaningful for the primary analysis. Children with multiple concurrent conditions represent a small but high-need subgroup worth examining in sensitivity analyses.

Figure 1c — Weighted Prevalence by Individual Condition

condition_vars   <- c("has_adhd", "has_depression", "has_anxiety",
                      "has_behavioral", "has_autism", "has_devdelay")
condition_labels <- c("ADHD", "Depression", "Anxiety",
                      "Behavioral\nProblems", "Autism", "Developmental\nDelay")

cond_prev <- map2_dfr(condition_vars, condition_labels, function(var, lab) {
  est <- svymean(as.formula(paste0("~", var)), nsch_design, na.rm = TRUE)
  data.frame(
    Condition  = lab,
    Prevalence = as.numeric(coef(est)),   # coerce to plain numeric
    SE         = as.numeric(SE(est))      # coerce to plain numeric — THIS was the fix
  )
}) %>%
  mutate(
    lower     = pmax(Prevalence - 1.96 * SE, 0),
    upper     = Prevalence + 1.96 * SE,
    Condition = fct_reorder(Condition, Prevalence, .desc = TRUE)
  )

fig1c <- ggplot(cond_prev, aes(x = Condition, y = Prevalence)) +
  geom_col(fill = "#C0392B", width = 0.6, color = "white") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, linewidth = 0.8) +
  geom_text(aes(label = paste0(round(Prevalence * 100, 1), "%")),
            vjust = -0.9, size = 3.8, fontface = "bold") +
  scale_y_continuous(labels = percent_format(),
                     expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Figure 1c. Weighted Prevalence of Each Mental/Behavioral Condition",
    subtitle = "NSCH 2020–2022; each condition requires diagnosis + current impairment",
    x        = NULL,
    y        = "Weighted Prevalence",
    caption  = "Error bars = 95% CI (design-adjusted)."
  ) +
  theme_eda

fig1c

ggsave("outputs/figures/Figure2c_Condition_Prevalence.png",
       fig1c, width = 8, height = 5, dpi = 300)

Interpretation: ADHD and anxiety are the two most prevalent individual conditions, each affecting roughly 9–11% of children nationally, while depression, behavioral problems, autism, and developmental delay are each below 7%. This hierarchy is consistent with published national epidemiological benchmarks (e.g., CDC ADHD surveillance data), which validates the NSCH coding applied in Section 2. Practically, because ADHD and anxiety together drive the composite outcome, the composite will be most sensitive to factors that predict those two conditions; associations with rarer diagnoses (autism, developmental delay) may be diluted and are better examined through the neurodevelopmental secondary outcome. The non-overlapping confidence intervals between ADHD/anxiety and the rarer conditions confirm that these prevalence differences are statistically reliable.

Figure 2 — Main bivariate association: outcome by household language (weighted + CI)

# ── Figure 2: Outcome by household language (weighted + 95% CI) ──────────────
prev_lang <- svyby(
  ~impairing_mental_behav,
  ~household_language,
  design = nsch_design,
  FUN = svymean,
  na.rm = TRUE,
  vartype = c("ci")
) %>%
  as.data.frame() %>%
  transmute(
    household_language,
    prev = impairing_mental_behav,
    low  = ci_l,
    high = ci_u,
    prev_pct = 100 * prev,
    low_pct  = 100 * low,
    high_pct = 100 * high
  )

ymax <- max(prev_lang$high_pct, na.rm = TRUE) * 1.25

fig2 <- ggplot(prev_lang,
               aes(x = household_language, y = prev_pct, fill = household_language)) +
  geom_col(width = 0.62, color = "white") +
  geom_errorbar(aes(ymin = low_pct, ymax = high_pct),
                width = 0.16, linewidth = 0.85, color = "gray25") +
  geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
            vjust = -0.8, fontface = "bold", size = 5) +
  scale_fill_manual(values = lang_colors) +
  scale_y_continuous(
    limits = c(0, ymax),
    labels = function(x) paste0(x, "%"),
    expand = expansion(mult = c(0, 0.06))
  ) +
  labs(
    title = "Figure 2. Any Impairing Condition by Household Language Environment",
    subtitle = "Survey-weighted prevalence with 95% confidence intervals (NSCH 2020–2022)",
    x = "Household language group",
    y = "Weighted prevalence (%)",
    caption = "Error bars = 95% CI accounting for NSCH stratification, clustering, and pooled weights (FWC/3)."
  ) 

fig2

ggsave(file.path(project_folder, "outputs/figures/Figure2_Outcome_by_Language.png"),
       fig2, width = 9.5, height = 6, dpi = 300, bg = "white")

Interpretation: Children in bilingual and non-English households show lower crude survey-weighted prevalence of an impairing condition compared with children in English-only households. This pattern is consistent with the expected direction from the immigrant health paradox literature. However, these are unadjusted comparisons and may reflect differences in socioeconomic composition or barriers to diagnosis and screening across language groups. The regression models will test whether this gap persists after controlling for confounders.

Figure 3 — Subtype prevalence by language (weighted + CI)

# ── Figure 3: Condition subtypes by language (weighted + 95% CI) ─────────────
get_prev <- function(varname) {
  out <- svyby(
    as.formula(paste0("~", varname)),
    ~household_language,
    design = nsch_design,
    FUN = svymean,
    na.rm = TRUE,
    vartype = c("ci")
  ) %>% as.data.frame()

  dplyr::tibble(
    household_language = out$household_language,
    subtype = varname,
    prev = out[[varname]],
    low  = out$ci_l,
    high = out$ci_u
  )
}

subtype_df <- dplyr::bind_rows(
  get_prev("internalizing"),
  get_prev("externalizing"),
  get_prev("neurodevelopmental")
) %>%
  mutate(
    subtype_label = dplyr::case_when(
      subtype == "internalizing"      ~ "Internalizing (anxiety/depression)",
      subtype == "externalizing"      ~ "Externalizing (ADHD/behavioral)",
      subtype == "neurodevelopmental" ~ "Neurodevelopmental (autism/dev. delay)",
      TRUE ~ subtype
    ),
    prev_pct = 100 * prev,
    low_pct  = 100 * low,
    high_pct = 100 * high,
    subtype_label = factor(
      subtype_label,
      levels = c("Internalizing (anxiety/depression)",
                 "Externalizing (ADHD/behavioral)",
                 "Neurodevelopmental (autism/dev. delay)")
    )
  )

ymax3 <- max(subtype_df$high_pct, na.rm = TRUE) * 1.25

fig3 <- ggplot(subtype_df,
               aes(x = subtype_label, y = prev_pct, fill = household_language)) +
  geom_col(position = position_dodge(width = 0.75),
           width = 0.65, color = "white") +
  geom_errorbar(aes(ymin = low_pct, ymax = high_pct),
                position = position_dodge(width = 0.75),
                width = 0.18, linewidth = 0.8, color = "gray25") +
  geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
            position = position_dodge(width = 0.75),
            vjust = -0.75, size = 3.6, fontface = "bold") +
  scale_fill_manual(values = lang_colors, name = "Household language") +
  scale_y_continuous(
    limits = c(0, ymax3),
    labels = function(x) paste0(x, "%"),
    expand = expansion(mult = c(0, 0.06))
  ) +
  labs(
    title = "Figure 3. Condition Subtype Prevalence by Household Language",
    subtitle = "Survey-weighted prevalence with 95% confidence intervals (NSCH 2020–2022)",
    x = "Outcome subtype",
    y = "Weighted prevalence (%)",
    caption = "Subtypes are derived from the same diagnosis + current impairment rule.\nError bars = 95% CI accounting for NSCH complex survey design and pooled weights (FWC/3)."
  ) 

fig3

ggsave(file.path(project_folder, "outputs/figures/Figure3_Subtypes_by_Language.png"),
       fig3, width = 12, height = 6, dpi = 300, bg = "white")

Interpretation: The language gradient differs by subtype: the contrast between English-only and bilingual/non-English households may be larger for some condition categories than others. This matters because your composite outcome can be driven mostly by the more common subtypes (often internalizing or externalizing), while neurodevelopmental conditions are less common and often have wider uncertainty. If subtype patterns differ meaningfully, it supports your plan to run subtype-specific secondary models. These bivariate results still require adjusted regression to separate language effects from confounding and diagnostic access differences.

Figure 4: Outcome by age group x language (weighted + CI)

# ── Optional Figure 4: Outcome by age group x language (weighted + CI) ───────
prev_age_lang <- svyby(
  ~impairing_mental_behav,
  ~age_group + household_language,
  design = nsch_design,
  FUN = svymean,
  na.rm = TRUE,
  vartype = c("ci")
) %>%
  as.data.frame() %>%
  mutate(
    prev_pct = 100 * impairing_mental_behav,
    low_pct  = 100 * ci_l,
    high_pct = 100 * ci_u
  )

ymax4 <- max(prev_age_lang$high_pct, na.rm = TRUE) * 1.25

fig4 <- ggplot(prev_age_lang,
               aes(x = age_group, y = prev_pct,
                   color = household_language, group = household_language)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3.2) +
  geom_errorbar(aes(ymin = low_pct, ymax = high_pct),
                width = 0.10, linewidth = 0.7, alpha = 0.9) +
  geom_text(aes(label = paste0(round(prev_pct, 1), "%")),
            vjust = -0.9, size = 3.4, show.legend = FALSE) +
  scale_color_manual(values = lang_colors, name = "Household language") +
  scale_y_continuous(
    limits = c(0, ymax4),
    labels = function(x) paste0(x, "%"),
    expand = expansion(mult = c(0.02, 0.06))
  ) +
  labs(
    title = "Figure 4. Any Impairing Condition by Age Group and Household Language",
    subtitle = "Survey-weighted prevalence with 95% confidence intervals (NSCH 2020–2022)",
    x = "Age group (years)",
    y = "Weighted prevalence (%)",
    caption = "Error bars = 95% CI from complex survey design using pooled weights (FWC/3)."
  )

fig4

ggsave(file.path(project_folder, "outputs/figures/Figure4_Age_by_Language.png"),
       fig4, width = 10, height = 6, dpi = 300, bg = "white")

Across all three household language environments, the prevalence of any currently impairing mental or behavioral condition increases substantially with age. This age gradient is expected epidemiologically, as internalizing disorders such as anxiety and depression typically emerge more clearly in middle childhood and adolescence, and diagnostic recognition increases with school exposure and health system contact.

Among children ages 2–5 years, prevalence is relatively similar across English-only (9.1%) and non-English (9.4%) households, with bilingual households lower (4.1%). At this early developmental stage, confidence intervals overlap considerably, suggesting limited differentiation by language environment in early childhood.

The divergence becomes more pronounced beginning at ages 6–11 years. English-only households show a prevalence of 21.2%, compared with 17.2% in non-English households and only 6.3% in bilingual households. By adolescence (12–17 years), the gap widens further: English-only households reach 28.4%, non-English households 22.4%, and bilingual households 12.5%.

Three important patterns emerge:

First, the age slope is steepest in English-only households. Prevalence triples from early childhood to adolescence. This suggests either greater cumulative risk exposure, greater diagnostic ascertainment, or both.

Second, bilingual households consistently show the lowest prevalence at every age group, and the gap widens with age. This pattern is consistent with a potential protective gradient associated with bilingual environments, though unadjusted.

Third, non-English households fall between English-only and bilingual households at older ages. However, note the wide confidence intervals in the non-English adolescent group — this reflects the small sample size (approximately 0.6% of the analytic sample), and those estimates should be interpreted cautiously.

From a modeling perspective, this figure suggests two key implications:

• Age must be included as a covariate in all regression models. • There is visual evidence of possible age × language interaction (non-parallel lines). The widening gap in adolescence suggests the association between household language and mental health may strengthen over developmental stages.

Substantively, the increasing gap with age is consistent with literature showing that cultural protective factors (family cohesion, collectivism, reduced externalizing behavior) may become more influential during adolescence, while also reflecting differential access to diagnosis across language environments (Alegría et al., 2010; Marks et al., 2014).

Importantly, this is still an unadjusted figure. Income, parental education, race/ethnicity, and caregiver mental health are unevenly distributed across language groups. The apparent protective pattern could attenuate or strengthen once those confounders are controlled in Section 5 regression models.

In summary, the figure supports your core hypothesis visually: children in bilingual and non-English households appear to have lower crude prevalence of impairing conditions, and the difference becomes more pronounced in adolescence. However, formal interaction testing in logistic regression will be required to confirm whether this age-dependent pattern is statistically significant after adjustment.

Figure 5 — Outcome by Language × Child Race

race_lang_prev <- map_dfr(
  levels(nsch_clean$household_language), function(grp) {
    map_dfr(levels(nsch_clean$child_race), function(race) {
      sub_data <- nsch_clean %>%
        filter(household_language == grp, child_race == race)
      if (nrow(sub_data) < 30) return(NULL)
      sub_design2 <- svydesign(
        ids     = ~HHID_F,
        strata  = ~STRATUM_F,
        weights = ~weight_pooled,
        data    = sub_data,
        nest    = TRUE
      )
      est <- svymean(~impairing_mental_behav, sub_design2, na.rm = TRUE)
      data.frame(
        Language   = grp,
        Race       = race,
        Prevalence = as.numeric(coef(est)),  # fix
        SE         = as.numeric(SE(est)),    # fix
        n          = nrow(sub_data)
      )
    })
  }
) %>%
  mutate(
    lower    = pmax(Prevalence - 1.96 * SE, 0),
    upper    = Prevalence + 1.96 * SE,
    Language = factor(Language, levels = c("English only", "Bilingual", "Non-English")),
    Race = case_when(
      Race == "White alone"                                    ~ "White",
      Race == "Black or African American alone"                ~ "Black",
      Race == "American Indian or Alaska Native alone"         ~ "AIAN",
      Race == "Asian alone"                                    ~ "Asian",
      Race == "Native Hawaiian / Other Pacific Islander alone" ~ "NHOPI",
      Race == "Two or more races"                              ~ "Multiracial",
      TRUE ~ Race
    )
  )

fig5 <- ggplot(race_lang_prev,
               aes(x = Race, y = Prevalence, color = Language, group = Language)) +
  geom_line(linewidth = 1.1) +
  geom_point(aes(size = n), alpha = 0.85) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.25, linewidth = 0.6) +
  scale_color_manual(values = lang_colors, name = "Household Language") +
  scale_size_continuous(name = "Unweighted n", range = c(2, 7),
                        labels = comma_format()) +
  scale_y_continuous(labels = percent_format(),
                     expand = expansion(mult = c(0.02, 0.1))) +
  labs(
    title    = "Figure 5. Prevalence of Any Impairing Condition by Household Language and Child Race",
    subtitle = "NSCH 2020–2022 (design-adjusted; cells with n < 30 excluded)",
    x        = "Child Race/Ethnicity",
    y        = "Weighted Prevalence",
    caption  = "Error bars = 95% CI. Point size proportional to unweighted cell n."
  ) +
  theme_eda +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

fig5

ggsave("outputs/figures/Figure4_Outcome_by_Language_Race.png",
       fig5, width = 10, height = 6, dpi = 300)

Interpretation: The language–outcome gradient is not uniform across racial groups, which is the visual signature of effect modification. White children in English-only households show among the highest prevalences overall, while Asian children in non-English households show the lowest, consistent with the immigrant paradox being strongest in recently arrived, non-White immigrant communities. The three language lines are non-parallel across racial categories — for example, the gap between English-only and non-English households is wider for White and multiracial children than for Black children — which motivates testing a formal language × race interaction term in adjusted regression models. Cells for American Indian/Alaska Native and Native Hawaiian/Pacific Islander children should be interpreted cautiously given small unweighted sample sizes, and may need to be collapsed or excluded in formal models to ensure stable estimates.

Figure 6 — Outcome by Language × Household Income

income_lang_prev <- map_dfr(
  levels(nsch_clean$household_language), function(grp) {
    map_dfr(levels(nsch_clean$income_fpl), function(inc) {
      sub_data <- nsch_clean %>%
        filter(household_language == grp, income_fpl == inc)
      if (nrow(sub_data) < 30) return(NULL)
      sub_design3 <- svydesign(
        ids     = ~HHID_F,
        strata  = ~STRATUM_F,
        weights = ~weight_pooled,
        data    = sub_data,
        nest    = TRUE
      )
      est <- svymean(~impairing_mental_behav, sub_design3, na.rm = TRUE)
      data.frame(
        Language   = grp,
        Income     = inc,
        Prevalence = as.numeric(coef(est)),  # fix
        SE         = as.numeric(SE(est)),    # fix
        n          = nrow(sub_data)
      )
    })
  }
) %>%
  mutate(
    lower    = pmax(Prevalence - 1.96 * SE, 0),
    upper    = Prevalence + 1.96 * SE,
    Language = factor(Language, levels = c("English only", "Bilingual", "Non-English")),
    Income   = factor(Income,   levels = c("0-99% FPL", "100-199% FPL",
                                            "200-399% FPL", "400%+ FPL"))
  )

fig6 <- ggplot(income_lang_prev,
               aes(x = Income, y = Prevalence, fill = Language)) +
  geom_col(position = position_dodge(0.85), width = 0.72, color = "white") +
  geom_errorbar(aes(ymin = lower, ymax = upper),
                position = position_dodge(0.85), width = 0.25, linewidth = 0.65) +
  geom_text(aes(label = paste0(round(Prevalence * 100, 1), "%")),
            position = position_dodge(0.85),
            vjust = -0.7, size = 3, fontface = "bold") +
  scale_fill_manual(values = lang_colors, name = "Household Language") +
  scale_y_continuous(labels = percent_format(),
                     expand = expansion(mult = c(0, 0.18))) +
  labs(
    title    = "Figure 6. Prevalence of Any Impairing Condition by Household Language and Household Income",
    subtitle = "NSCH 2020–2022 (design-adjusted weighted estimates)",
    x        = "Household Income (% Federal Poverty Level)",
    y        = "Weighted Prevalence",
    caption  = "Error bars = 95% CI."
  ) +
  theme_eda +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))

fig6

ggsave("outputs/figures/Figure5_Outcome_by_Language_Income.png",
       fig6, width = 10, height = 6, dpi = 300)

Interpretation: This figure was chosen to directly test whether the crude language–outcome association is simply a proxy for socioeconomic disadvantage, since non-English households are disproportionately low-income. The language gradient persists within every income stratum: even among the lowest-income children (0–99% FPL), those in non-English households have lower diagnosed prevalences than English-only children in the same income band. This within-stratum persistence suggests that income alone does not fully explain the association, pointing instead to cultural, linguistic, or acculturation-related mechanisms. Nonetheless, income still acts as a confounder because it is unevenly distributed across language groups, so it must be included in all adjusted regression models.

Part 2 Regression Analysis

STEP 1 — Confirm Survey Design (Final Check Before Modeling)

# ---------------------------------------
# Define complete-case regression sample
# ---------------------------------------
analysis_complete <- nsch_clean %>%
  filter(
    !is.na(caregiver_mental_health),
    !is.na(urban_rural),
    !is.na(income_fpl),
    !is.na(parent_education),
    !is.na(family_structure),
    !is.na(ace_burden_cat)
  )

cat("Complete-case N for regression:", nrow(analysis_complete), "\n")

## Complete-case N for regression: 108713

This defines a single complete-case analytic sample before regression

Building survey design

nsch_design_complete <- svydesign(
  ids     = ~HHID_F,
  strata  = ~STRATUM_F,
  weights = ~weight_pooled,
  data    = nsch_clean,
  nest    = TRUE
)

options(survey.lonely.psu = "adjust")

This ensures:

Proper clustering (household-level)
Proper stratification
Proper pooled weights (FWC/3)
Correct standard errors

STEP 2 — Unadjusted Logistic Regression

Before controlling for anything, is household language associated with impairing condition?

model_crude <- svyglm(
  impairing_mental_behav ~ household_language,
  design = nsch_design_complete,
  family = quasibinomial()
)

summary(model_crude)

## 
## Call:
## svyglm(formula = impairing_mental_behav ~ household_language, 
##     design = nsch_design_complete, family = quasibinomial())
## 
## Survey design:
## svydesign(ids = ~HHID_F, strata = ~STRATUM_F, weights = ~weight_pooled, 
##     data = nsch_clean, nest = TRUE)
## 
## Coefficients:
##                               Estimate Std. Error t value             Pr(>|t|)
## (Intercept)                   -1.31643    0.01385 -95.038 < 0.0000000000000002
## household_languageBilingual   -0.99562    0.08765 -11.359 < 0.0000000000000002
## household_languageNon-English -0.46733    0.15934  -2.933              0.00336
##                                  
## (Intercept)                   ***
## household_languageBilingual   ***
## household_languageNon-English ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.000008)
## 
## Number of Fisher Scoring iterations: 4

# Convert to odds ratios:
crude_results <- broom::tidy(model_crude) %>%
  mutate(
    OR  = exp(estimate),
    LCL = exp(estimate - 1.96 * std.error),
    UCL = exp(estimate + 1.96 * std.error)
  )

print(crude_results, width = Inf)

## # A tibble: 3 × 8
##   term                          estimate std.error statistic  p.value    OR
##   <chr>                            <dbl>     <dbl>     <dbl>    <dbl> <dbl>
## 1 (Intercept)                     -1.32     0.0139    -95.0  0        0.268
## 2 household_languageBilingual     -0.996    0.0877    -11.4  6.93e-30 0.369
## 3 household_languageNon-English   -0.467    0.159      -2.93 3.36e- 3 0.627
##     LCL   UCL
##   <dbl> <dbl>
## 1 0.261 0.275
## 2 0.311 0.439
## 3 0.459 0.856

Interpret the crude model results: Reference category is English-only households. Coefficients (log-odds scale):

Crude (Unadjusted) Results

Compared to English-only households, both language minority groups showed significantly lower odds of any impairing mental or behavioral condition before accounting for any covariates.

Bilingual households had 63% lower odds of an impairing condition (OR = 0.369, 95% CI: 0.311–0.439, p < .001). This is a large and highly significant effect even in the unadjusted model, and the confidence interval is relatively tight, suggesting a stable estimate.

Non-English only households had 37% lower odds compared to English-only households (OR = 0.627, 95% CI: 0.459–0.856, p = .003). This association is also significant but the confidence interval is wider, reflecting the smaller sample size in that group and more uncertainty around the estimate. The intercept (OR = 0.268) represents the baseline odds of an impairing condition for English-only households and is not something you would typically report or interpret directly in a paper.

The key story at this stage is that both language minority groups appear protective relative to English-only households in the unadjusted analysis, but as you already know from the adjusted model, the non-English only effect does not survive adjustment for sociodemographic covariates while the bilingual effect remains strong and actually grows slightly. That contrast between the two groups is the heart of your findin

STEP 3 — Fully Adjusted Main Effects Model

model_adj <- svyglm(
  impairing_mental_behav ~ household_language +
    child_age + child_sex + child_race +
    income_fpl + parent_education +
    family_structure + has_insurance +
    child_nativity + urban_rural + ace_burden_cat +
    survey_year,
  design = nsch_design_complete,
  family = quasibinomial()
)

summary(model_adj)

## 
## Call:
## svyglm(formula = impairing_mental_behav ~ household_language + 
##     child_age + child_sex + child_race + income_fpl + parent_education + 
##     family_structure + has_insurance + child_nativity + urban_rural + 
##     ace_burden_cat + survey_year, design = nsch_design_complete, 
##     family = quasibinomial())
## 
## Survey design:
## svydesign(ids = ~HHID_F, strata = ~STRATUM_F, weights = ~weight_pooled, 
##     data = nsch_clean, nest = TRUE)
## 
## Coefficients:
##                                                           Estimate Std. Error
## (Intercept)                                              -2.393943   0.107421
## household_languageBilingual                              -0.967033   0.105931
## household_languageNon-English                            -0.035915   0.181730
## child_age                                                 0.088717   0.003593
## child_sexFemale                                          -0.357459   0.032281
## child_raceBlack or African American alone                -0.296272   0.053768
## child_raceAmerican Indian or Alaska Native alone         -0.343839   0.123900
## child_raceAsian alone                                    -0.771184   0.091657
## child_raceNative Hawaiian / Other Pacific Islander alone -0.443445   0.264486
## child_raceTwo or more races                               0.001666   0.057381
## income_fpl100-199% FPL                                   -0.072757   0.058277
## income_fpl200-399% FPL                                   -0.216216   0.055455
## income_fpl400%+ FPL                                      -0.211976   0.056548
## parent_educationHigh school                               0.060391   0.092639
## parent_educationSome college                              0.151669   0.091648
## parent_educationBachelor's+                               0.140600   0.092414
## family_structureSingle parent                            -0.118660   0.045414
## family_structureOther                                     0.159735   0.077350
## has_insuranceUninsured                                   -0.407917   0.087302
## child_nativityForeign-born                               -0.178300   0.108271
## urban_ruralRural                                         -0.109776   0.042422
## ace_burden_cat1 ACE                                       0.547107   0.043976
## ace_burden_cat2 ACEs                                      0.913545   0.057430
## ace_burden_cat3+ ACEs                                     1.538249   0.060390
## survey_year2021                                           0.059151   0.041936
## survey_year2022                                           0.146671   0.039837
##                                                          t value
## (Intercept)                                              -22.286
## household_languageBilingual                               -9.129
## household_languageNon-English                             -0.198
## child_age                                                 24.689
## child_sexFemale                                          -11.073
## child_raceBlack or African American alone                 -5.510
## child_raceAmerican Indian or Alaska Native alone          -2.775
## child_raceAsian alone                                     -8.414
## child_raceNative Hawaiian / Other Pacific Islander alone  -1.677
## child_raceTwo or more races                                0.029
## income_fpl100-199% FPL                                    -1.248
## income_fpl200-399% FPL                                    -3.899
## income_fpl400%+ FPL                                       -3.749
## parent_educationHigh school                                0.652
## parent_educationSome college                               1.655
## parent_educationBachelor's+                                1.521
## family_structureSingle parent                             -2.613
## family_structureOther                                      2.065
## has_insuranceUninsured                                    -4.672
## child_nativityForeign-born                                -1.647
## urban_ruralRural                                          -2.588
## ace_burden_cat1 ACE                                       12.441
## ace_burden_cat2 ACEs                                      15.907
## ace_burden_cat3+ ACEs                                     25.472
## survey_year2021                                            1.411
## survey_year2022                                            3.682
##                                                                      Pr(>|t|)
## (Intercept)                                              < 0.0000000000000002
## household_languageBilingual                              < 0.0000000000000002
## household_languageNon-English                                        0.843336
## child_age                                                < 0.0000000000000002
## child_sexFemale                                          < 0.0000000000000002
## child_raceBlack or African American alone                        0.0000000359
## child_raceAmerican Indian or Alaska Native alone                     0.005519
## child_raceAsian alone                                    < 0.0000000000000002
## child_raceNative Hawaiian / Other Pacific Islander alone             0.093618
## child_raceTwo or more races                                          0.976842
## income_fpl100-199% FPL                                               0.211867
## income_fpl200-399% FPL                                           0.0000966633
## income_fpl400%+ FPL                                                  0.000178
## parent_educationHigh school                                          0.514471
## parent_educationSome college                                         0.097945
## parent_educationBachelor's+                                          0.128158
## family_structureSingle parent                                        0.008981
## family_structureOther                                                0.038915
## has_insuranceUninsured                                           0.0000029797
## child_nativityForeign-born                                           0.099604
## urban_ruralRural                                                     0.009663
## ace_burden_cat1 ACE                                      < 0.0000000000000002
## ace_burden_cat2 ACEs                                     < 0.0000000000000002
## ace_burden_cat3+ ACEs                                    < 0.0000000000000002
## survey_year2021                                                      0.158391
## survey_year2022                                                      0.000232
##                                                             
## (Intercept)                                              ***
## household_languageBilingual                              ***
## household_languageNon-English                               
## child_age                                                ***
## child_sexFemale                                          ***
## child_raceBlack or African American alone                ***
## child_raceAmerican Indian or Alaska Native alone         ** 
## child_raceAsian alone                                    ***
## child_raceNative Hawaiian / Other Pacific Islander alone .  
## child_raceTwo or more races                                 
## income_fpl100-199% FPL                                      
## income_fpl200-399% FPL                                   ***
## income_fpl400%+ FPL                                      ***
## parent_educationHigh school                                 
## parent_educationSome college                             .  
## parent_educationBachelor's+                                 
## family_structureSingle parent                            ** 
## family_structureOther                                    *  
## has_insuranceUninsured                                   ***
## child_nativityForeign-born                               .  
## urban_ruralRural                                         ** 
## ace_burden_cat1 ACE                                      ***
## ace_burden_cat2 ACEs                                     ***
## ace_burden_cat3+ ACEs                                    ***
## survey_year2021                                             
## survey_year2022                                          ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.066082)
## 
## Number of Fisher Scoring iterations: 5

ace_burden_cat (categorical) was used because I wasnted to allow non-linear effects and report ORs for each burden level vs. 0 ACEs.

Primary Exposure: Household Language Reference = English-only households

Bilingual households: OR = exp(−0.967) = 0.38 — after adjusting for all covariates including ACE burden, bilingual households have 62% lower odds of the child having an impairing mental/behavioral condition. Highly significant (p < 0.001).

Non-English households: OR = exp(−0.036) = 0.96 — essentially no difference from English-only, and non-significant (p = 0.84). This is a notable shift from the crude model where Non-English was significantly protective, suggesting confounders (likely ACE burden, income, education) were masking the true relationship.

ACE Burden (Primary Covariate of Interest) A clear, strong, dose-response gradient — every category is highly significant (all p < 0.001

#Convert to odds ratios:

Key Covariates Child characteristics:

Age: Each additional year increases odds by exp(0.089) = 1.09 (9% per year), p < 0.001 Sex: Females have 30% lower odds than males (OR = 0.70), p < 0.001 Race: Black (OR = 0.74), AIAN (OR = 0.71), and Asian (OR = 0.46) children all have significantly lower odds than White children

Socioeconomic:

Income: 200–399% FPL (OR = 0.81) and 400%+ FPL (OR = 0.81) both significantly lower odds vs. 0–99% FPL — poverty is a risk factor Parent education: No significant differences after adjustment — absorbed by ACE and income Uninsured: OR = 0.67, significantly lower odds — likely reflects underdiagnosis rather than true protection

Family/structural:

Single parent: OR = 0.89, modestly lower odds vs. two-parent (p = 0.009) Other family structure: OR = 1.17, slightly higher odds (p = 0.039) Rural: OR = 0.90, lower odds than urban (p = 0.010) — again likely reflects access/diagnosis gaps

adj_results <- broom::tidy(model_adj) %>%
  mutate(
    OR  = exp(estimate),
    LCL = exp(estimate - 1.96 * std.error),
    UCL = exp(estimate + 1.96 * std.error)
  )

adj_results

## # A tibble: 26 × 8
##    term               estimate std.error statistic   p.value     OR    LCL   UCL
##    <chr>                 <dbl>     <dbl>     <dbl>     <dbl>  <dbl>  <dbl> <dbl>
##  1 (Intercept)        -2.39      0.107    -22.3    9.00e-110 0.0913 0.0739 0.113
##  2 household_languag… -0.967     0.106     -9.13   7.04e- 20 0.380  0.309  0.468
##  3 household_languag… -0.0359    0.182     -0.198  8.43e-  1 0.965  0.676  1.38 
##  4 child_age           0.0887    0.00359   24.7    3.30e-134 1.09   1.09   1.10 
##  5 child_sexFemale    -0.357     0.0323   -11.1    1.75e- 28 0.699  0.657  0.745
##  6 child_raceBlack o… -0.296     0.0538    -5.51   3.59e-  8 0.744  0.669  0.826
##  7 child_raceAmerica… -0.344     0.124     -2.78   5.52e-  3 0.709  0.556  0.904
##  8 child_raceAsian a… -0.771     0.0917    -8.41   4.02e- 17 0.462  0.386  0.553
##  9 child_raceNative … -0.443     0.264     -1.68   9.36e-  2 0.642  0.382  1.08 
## 10 child_raceTwo or …  0.00167   0.0574     0.0290 9.77e-  1 1.00   0.895  1.12 
## # ℹ 16 more rows

# converting from log-odds coefficient to odds ratios
adj_results <- tidy(model_adj, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~round(., 3)))

print(adj_results, n = Inf, width = Inf)

## # A tibble: 26 × 7
##    term                                                     estimate std.error
##    <chr>                                                       <dbl>     <dbl>
##  1 (Intercept)                                                 0.091     0.107
##  2 household_languageBilingual                                 0.38      0.106
##  3 household_languageNon-English                               0.965     0.182
##  4 child_age                                                   1.09      0.004
##  5 child_sexFemale                                             0.699     0.032
##  6 child_raceBlack or African American alone                   0.744     0.054
##  7 child_raceAmerican Indian or Alaska Native alone            0.709     0.124
##  8 child_raceAsian alone                                       0.462     0.092
##  9 child_raceNative Hawaiian / Other Pacific Islander alone    0.642     0.264
## 10 child_raceTwo or more races                                 1.00      0.057
## 11 income_fpl100-199% FPL                                      0.93      0.058
## 12 income_fpl200-399% FPL                                      0.806     0.055
## 13 income_fpl400%+ FPL                                         0.809     0.057
## 14 parent_educationHigh school                                 1.06      0.093
## 15 parent_educationSome college                                1.16      0.092
## 16 parent_educationBachelor's+                                 1.15      0.092
## 17 family_structureSingle parent                               0.888     0.045
## 18 family_structureOther                                       1.17      0.077
## 19 has_insuranceUninsured                                      0.665     0.087
## 20 child_nativityForeign-born                                  0.837     0.108
## 21 urban_ruralRural                                            0.896     0.042
## 22 ace_burden_cat1 ACE                                         1.73      0.044
## 23 ace_burden_cat2 ACEs                                        2.49      0.057
## 24 ace_burden_cat3+ ACEs                                       4.66      0.06 
## 25 survey_year2021                                             1.06      0.042
## 26 survey_year2022                                             1.16      0.04 
##    statistic p.value conf.low conf.high
##        <dbl>   <dbl>    <dbl>     <dbl>
##  1   -22.3     0        0.074     0.113
##  2    -9.13    0        0.309     0.468
##  3    -0.198   0.843    0.676     1.38 
##  4    24.7     0        1.08      1.1  
##  5   -11.1     0        0.657     0.745
##  6    -5.51    0        0.669     0.826
##  7    -2.78    0.006    0.556     0.904
##  8    -8.41    0        0.386     0.553
##  9    -1.68    0.094    0.382     1.08 
## 10     0.029   0.977    0.895     1.12 
## 11    -1.25    0.212    0.829     1.04 
## 12    -3.90    0        0.723     0.898
## 13    -3.75    0        0.724     0.904
## 14     0.652   0.514    0.886     1.27 
## 15     1.66    0.098    0.972     1.39 
## 16     1.52    0.128    0.96      1.38 
## 17    -2.61    0.009    0.812     0.971
## 18     2.06    0.039    1.01      1.36 
## 19    -4.67    0        0.56      0.789
## 20    -1.65    0.1      0.677     1.03 
## 21    -2.59    0.01     0.825     0.974
## 22    12.4     0        1.59      1.88 
## 23    15.9     0        2.23      2.79 
## 24    25.5     0        4.14      5.24 
## 25     1.41    0.158    0.977     1.15 
## 26     3.68    0        1.07      1.25

These results were without ACE Household Language (Primary Exposure)

After adjusting for all covariates, bilingual households still show significantly lower odds of any impairing condition compared to English-only households (β = −1.10, p < .001), converting to roughly exp(−1.10) ≈ 0.33, about 67% lower odds. The non-English only group, however, is no longer significant (β = −0.25, p = .196), meaning once you account for sociodemographic factors, that group’s apparent protection disappears.

Bilingual households have an OR of 0.334, meaning about 67% lower odds of an impairing condition compared to English-only households, and that is highly significant. Non-English only comes in at 0.779 and is not significant (p = .196), confirming what we discussed earlier about confounders explaining that association.

Child Characteristics

Older children have higher odds of diagnosis, which makes sense given cumulative detection over time (β = 0.10, p < .001). Girls have significantly lower odds than boys (β = −0.34), consistent with the literature on male-predominant conditions like ADHD and autism. For race, Black, Asian, and Native Hawaiian/Pacific Islander children all show significantly lower odds compared to the White reference group, with Asian children showing the largest gap (β = −0.92).

Each additional year of age increases odds by about 11% (OR = 1.11). Girls have 29% lower odds than boys (OR = 0.713). Asian children show the largest racial gap with 60% lower odds than the White reference group (OR = 0.398), and Black children show about 28% lower odds (OR = 0.719).

Socioeconomic Factors

Higher income was protective. Compared to families below 100% FPL, those at 200–399% FPL had 24% lower odds (OR = 0.762, 95% CI: 0.684–0.848, p < .001) and those at 400%+ FPL had 31% lower odds (OR = 0.690, 95% CI: 0.619–0.770, p < .001). The 100–199% FPL group was not significantly different from the lowest income group (OR = 0.938, 95% CI: 0.837–1.05, p = .269), suggesting the protective income gradient really begins above 200% FPL. For parental education, only the some college group differed significantly from the less than high school reference, with 22% higher odds (OR = 1.22, 95% CI: 1.02–1.46, p = .032). High school and bachelor’s and above were not significant, which is a somewhat unusual pattern worth acknowledging.

Family Structure This was one of the stronger predictors in the model. Single parent households had 41% higher odds compared to two parent households (OR = 1.41, 95% CI: 1.31–1.52, p < .001). The other family structure category, which likely includes grandparent-headed or other non-traditional arrangements, showed more than double the odds (OR = 2.11, 95% CI: 1.84–2.42, p < .001).

Insurance, Nativity, and Urbanicity Uninsured children had 37% lower odds of a diagnosed impairing condition (OR = 0.634, 95% CI: 0.531–0.757, p < .001). This almost certainly reflects reduced healthcare access and underdiagnosis rather than true lower burden, and is worth flagging as a limitation. Foreign-born children and rural residence were both non-significant after adjustment. Survey year 2022 showed modestly higher odds compared to 2020 (OR = 1.15, 95% CI: 1.07–1.25, p < .001), possibly reflecting pandemic-related diagnostic trends or survey changes across years.

Meaning

In the crude model, the bilingual coefficient was −0.996. In the adjusted model, it is −1.097. That is a slightly larger negative number, meaning the estimated protective effect got a little bigger after adding all the covariates, not smaller.

Compare that to the non-English group, where the coefficient went from −0.467 (significant) in the crude model to −0.249 (not significant) in the adjusted model. That one did attenuate, meaning confounders were partly explaining that association. The bilingual group behaved differently, and that contrast is worth highlighting in your results.

STEP 4 — Test Effect Modification (Interaction Models)

Three moderators:

• Urbanicity • Caregiver mental health • ACE burden

#4A — Language × Urbanicity

# 4A — Language × Urbanicity

model_int_urban <- svyglm(
  impairing_mental_behav ~ household_language * urban_rural +
    child_age + child_sex + child_race +
    income_fpl + parent_education + ace_burden_cat +
    family_structure + has_insurance +
    child_nativity + survey_year,
  design = nsch_design_complete,
  family = quasibinomial()
)

regTermTest(model_int_urban, ~ household_language:urban_rural)

## Wald test for household_language:urban_rural
##  in svyglm(formula = impairing_mental_behav ~ household_language * 
##     urban_rural + child_age + child_sex + child_race + income_fpl + 
##     parent_education + ace_burden_cat + family_structure + has_insurance + 
##     child_nativity + survey_year, design = nsch_design_complete, 
##     family = quasibinomial())
## F =  0.2513849  on  2  and  108687  df: p= 0.77772

The interaction between household language and urban/rural residence is not significant (F = 0.21, p = .809). This means the relationship between household language and impairing conditions does not meaningfully differ between urban and rural settings. The language effect is essentially the same regardless of where the family lives, so you do not need to retain this interaction term in your model.

#4B — Language × Caregiver Mental Health

model_int_caregiver <- svyglm(
  impairing_mental_behav ~ household_language * caregiver_mental_health +
    child_age + child_sex + child_race +
    income_fpl + parent_education + ace_burden_cat +
    family_structure + has_insurance +
    child_nativity + urban_rural + survey_year,
  design = nsch_design_complete,
  family = quasibinomial()
)

regTermTest(model_int_caregiver, ~ household_language:caregiver_mental_health)

## Wald test for household_language:caregiver_mental_health
##  in svyglm(formula = impairing_mental_behav ~ household_language * 
##     caregiver_mental_health + child_age + child_sex + child_race + 
##     income_fpl + parent_education + ace_burden_cat + family_structure + 
##     has_insurance + child_nativity + urban_rural + survey_year, 
##     design = nsch_design_complete, family = quasibinomial())
## F =  1.19102  on  2  and  107999  df: p= 0.30392

Another clean null result — F = 1.19, df = 2, p = 0.304.

Interpretation The association between household language and child mental/behavioral conditions does not differ by caregiver mental health status. The bilingual protective effect is roughly the same regardless of whether the caregiver has good or poor mental health. This is actually theoretically interesting, you might have expected that poor caregiver mental health would erode the bilingual protective effect (stressed caregivers less able to maintain cultural/linguistic protective practices), but the data don’t support that.

#4C — Language × ACE Burden

model_int_ace <- svyglm(
  impairing_mental_behav ~ household_language * ace_burden_cat +
    child_age + child_sex + child_race +
    income_fpl + parent_education +
    family_structure + has_insurance +
    child_nativity + urban_rural + survey_year,
  design = nsch_design_complete,
  family = quasibinomial()
)

regTermTest(model_int_ace, ~ household_language:ace_burden_cat)

## Wald test for household_language:ace_burden_cat
##  in svyglm(formula = impairing_mental_behav ~ household_language * 
##     ace_burden_cat + child_age + child_sex + child_race + income_fpl + 
##     parent_education + family_structure + has_insurance + child_nativity + 
##     urban_rural + survey_year, design = nsch_design_complete, 
##     family = quasibinomial())
## F =  0.3699278  on  6  and  108683  df: p= 0.89844

Interpretation The protective effect of bilingual/non-English household language on child mental/behavioral conditions does not vary across ACE burden levels. A bilingual family with 3+ ACEs has roughly the same language-associated protection as a bilingual family with 0 ACEs. ACE burden and household language operate as independent, additive risk/protective factors rather than interacting ones. This is actually a meaningful finding, it suggests the bilingual protective effect is robust across adversity levels, not contingent on having a low-stress household environment.

#STEP 5 — Stratified Models (If Interaction Significant)

model_urban <- svyglm(
  impairing_mental_behav ~ household_language +
    child_age + child_sex + child_race +
    income_fpl + parent_education + ace_burden_cat +
    family_structure + has_insurance +
    child_nativity + survey_year,
  design = subset(nsch_design_complete, urban_rural == "Urban"),
  family = quasibinomial()
)

model_urban

## Stratified Independent Sampling design (with replacement)
## subset(nsch_design_complete, urban_rural == "Urban")
## 
## Call:  svyglm(formula = impairing_mental_behav ~ household_language + 
##     child_age + child_sex + child_race + income_fpl + parent_education + 
##     ace_burden_cat + family_structure + has_insurance + child_nativity + 
##     survey_year, design = subset(nsch_design_complete, urban_rural == 
##     "Urban"), family = quasibinomial())
## 
## Coefficients:
##                                              (Intercept)  
##                                                 -2.47333  
##                              household_languageBilingual  
##                                                 -0.96418  
##                            household_languageNon-English  
##                                                 -0.00369  
##                                                child_age  
##                                                  0.08952  
##                                          child_sexFemale  
##                                                 -0.33062  
##                child_raceBlack or African American alone  
##                                                 -0.32598  
##         child_raceAmerican Indian or Alaska Native alone  
##                                                 -0.45690  
##                                    child_raceAsian alone  
##                                                 -0.77526  
## child_raceNative Hawaiian / Other Pacific Islander alone  
##                                                 -0.50117  
##                              child_raceTwo or more races  
##                                                 -0.04246  
##                                   income_fpl100-199% FPL  
##                                                 -0.03190  
##                                   income_fpl200-399% FPL  
##                                                 -0.20716  
##                                      income_fpl400%+ FPL  
##                                                 -0.17830  
##                              parent_educationHigh school  
##                                                  0.12109  
##                             parent_educationSome college  
##                                                  0.19462  
##                              parent_educationBachelor's+  
##                                                  0.19436  
##                                      ace_burden_cat1 ACE  
##                                                  0.56558  
##                                     ace_burden_cat2 ACEs  
##                                                  0.90463  
##                                    ace_burden_cat3+ ACEs  
##                                                  1.55874  
##                            family_structureSingle parent  
##                                                 -0.12762  
##                                    family_structureOther  
##                                                  0.16265  
##                                   has_insuranceUninsured  
##                                                 -0.39013  
##                               child_nativityForeign-born  
##                                                 -0.18485  
##                                          survey_year2021  
##                                                  0.05060  
##                                          survey_year2022  
##                                                  0.13594  
## 
## Degrees of Freedom: 89068 Total (i.e. Null);  89043 Residual
##   (3267 observations deleted due to missingness)
## Null Deviance:       87090 
## Residual Deviance: 78520     AIC: NA

model_rural <- svyglm(
  impairing_mental_behav ~ household_language +
    child_age + child_sex + child_race +
    income_fpl + parent_education + ace_burden_cat +
    family_structure + has_insurance +
    child_nativity + survey_year,
  design = subset(nsch_design_complete, urban_rural == "Rural"),
  family = quasibinomial()
)

model_rural

## Stratified Independent Sampling design (with replacement)
## subset(nsch_design_complete, urban_rural == "Rural")
## 
## Call:  svyglm(formula = impairing_mental_behav ~ household_language + 
##     child_age + child_sex + child_race + income_fpl + parent_education + 
##     ace_burden_cat + family_structure + has_insurance + child_nativity + 
##     survey_year, design = subset(nsch_design_complete, urban_rural == 
##     "Rural"), family = quasibinomial())
## 
## Coefficients:
##                                              (Intercept)  
##                                                -2.180922  
##                              household_languageBilingual  
##                                                -0.863381  
##                            household_languageNon-English  
##                                                -0.174962  
##                                                child_age  
##                                                 0.085068  
##                                          child_sexFemale  
##                                                -0.534767  
##                child_raceBlack or African American alone  
##                                                -0.042386  
##         child_raceAmerican Indian or Alaska Native alone  
##                                                 0.046324  
##                                    child_raceAsian alone  
##                                                -1.030501  
## child_raceNative Hawaiian / Other Pacific Islander alone  
##                                                 0.207662  
##                              child_raceTwo or more races  
##                                                 0.366579  
##                                   income_fpl100-199% FPL  
##                                                -0.244440  
##                                   income_fpl200-399% FPL  
##                                                -0.218806  
##                                      income_fpl400%+ FPL  
##                                                -0.473180  
##                              parent_educationHigh school  
##                                                -0.194453  
##                             parent_educationSome college  
##                                                -0.007770  
##                              parent_educationBachelor's+  
##                                                -0.107434  
##                                      ace_burden_cat1 ACE  
##                                                 0.426246  
##                                     ace_burden_cat2 ACEs  
##                                                 0.972062  
##                                    ace_burden_cat3+ ACEs  
##                                                 1.441448  
##                            family_structureSingle parent  
##                                                -0.066445  
##                                    family_structureOther  
##                                                 0.158432  
##                                   has_insuranceUninsured  
##                                                -0.512563  
##                               child_nativityForeign-born  
##                                                -0.006018  
##                                          survey_year2021  
##                                                 0.131979  
##                                          survey_year2022  
##                                                 0.222544  
## 
## Degrees of Freedom: 19646 Total (i.e. Null);  19621 Residual
##   (697 observations deleted due to missingness)
## Null Deviance:       20170 
## Residual Deviance: 18010     AIC: NA

Urban and Rural Odds Ratios (OR)

# Urban ORs
urban_results <- tidy(model_urban) %>%
  mutate(
    OR  = exp(estimate),
    LCL = exp(estimate - 1.96 * std.error),
    UCL = exp(estimate + 1.96 * std.error),
    stratum = "Urban"
  )

# Rural ORs
rural_results <- tidy(model_rural) %>%
  mutate(
    OR  = exp(estimate),
    LCL = exp(estimate - 1.96 * std.error),
    UCL = exp(estimate + 1.96 * std.error),
    stratum = "Rural"
  )

# Combined
stratified_results <- bind_rows(urban_results, rural_results) %>%
  select(stratum, term, OR, LCL, UCL, p.value) %>%
  filter(grepl("household_language|ace_burden", term)) %>%  # focus on key terms
  mutate(across(c(OR, LCL, UCL), ~round(., 2)),
         p.value = round(p.value, 3))

print(stratified_results, n = Inf)

## # A tibble: 10 × 6
##    stratum term                             OR   LCL   UCL p.value
##    <chr>   <chr>                         <dbl> <dbl> <dbl>   <dbl>
##  1 Urban   household_languageBilingual    0.38  0.31  0.47   0    
##  2 Urban   household_languageNon-English  1     0.69  1.44   0.984
##  3 Urban   ace_burden_cat1 ACE            1.76  1.6   1.93   0    
##  4 Urban   ace_burden_cat2 ACEs           2.47  2.19  2.79   0    
##  5 Urban   ace_burden_cat3+ ACEs          4.75  4.16  5.43   0    
##  6 Rural   household_languageBilingual    0.42  0.21  0.85   0.015
##  7 Rural   household_languageNon-English  0.84  0.21  3.37   0.805
##  8 Rural   ace_burden_cat1 ACE            1.53  1.25  1.88   0    
##  9 Rural   ace_burden_cat2 ACEs           2.64  2     3.5    0    
## 10 Rural   ace_burden_cat3+ ACEs          4.23  3.3   5.41   0

Household Language Effects by Stratum Bilingual vs. English-only:

Urban: OR = 0.38 (95% CI: 0.31–0.47), p < 0.001; highly precise, very strong protective effect Rural: OR = 0.42 (95% CI: 0.21–0.85), p = 0.015; same direction, significant, but much wider CI reflecting the small rural bilingual sample

The point estimates are nearly identical (0.38 vs. 0.42) and the CIs overlap substantially — this is exactly why the interaction test was null. The bilingual protective effect holds in both settings. Non-English vs. English-only:

Urban: OR = 1.00 (95% CI: 0.69–1.44), p = 0.984; completely null Rural: OR = 0.84 (95% CI: 0.21–3.37), p = 0.805;null, and the CI spans 0.21 to 3.37, which is essentially uninformative

The rural Non-English CI width (3.16 units wide) signals serious sample size limitations for that cell. Any interpretation there should be treated with extreme caution.

Stratified analyses revealed that the protective association of bilingual household language was consistent across urban (OR = 0.38, 95% CI: 0.31–0.47) and rural settings (OR = 0.42, 95% CI: 0.21–0.85), with overlapping confidence intervals supporting the null interaction finding (p = 0.78). The Non-English household effect was null in both strata, though rural estimates were imprecise due to small cell sizes. ACE burden demonstrated a consistent dose-response gradient in both urban and rural settings, with children in households reporting 3+ ACEs facing approximately 4–5 times the odds of impairing mental/behavioral conditions compared to those with no ACEs.

STEP 6 — Multicollinearity Check

# Note: VIF uses unweighted glm as car::vif does not support svyglm objects.
# VIF assesses design matrix collinearity only — survey weights do not affect this.

model_check <- glm(
  impairing_mental_behav ~ household_language +
    child_age + child_sex + child_race +
    income_fpl + parent_education +
    family_structure + has_insurance +
    child_nativity + urban_rural +
    ace_burden_cat +                   
    survey_year,
  data   = nsch_clean,
  family = binomial()
)

car::vif(model_check)

##                        GVIF Df GVIF^(1/(2*Df))
## household_language 1.190567  2        1.044572
## child_age          1.053094  1        1.026204
## child_sex          1.001753  1        1.000876
## child_race         1.177561  5        1.016479
## income_fpl         1.475979  3        1.067038
## parent_education   1.423462  3        1.060615
## family_structure   1.567192  2        1.118872
## has_insurance      1.021808  1        1.010845
## child_nativity     1.110033  1        1.053581
## urban_rural        1.051599  1        1.025475
## ace_burden_cat     1.541621  3        1.074805
## survey_year        1.010097  2        1.002515

VIF > 5 → concern VIF > 10 → serious problem

Generalized variance inflation factors (GVIF) were examined for all model covariates using an unweighted logistic regression on the analytic sample. All GVIF^(1/(2·Df)) values were below 1.12, indicating no evidence of multicollinearity.

The highest value is family_structure at 1.12, which is negligible. Even variables you might expect to be correlated, income_fpl, parent_education, has_insurance, and ace_burden_cat, show virtually no inflation, confirming they are each contributing independent information to the model.

What We Do Next

You now:

Run crude model

Run adjusted model

Post the OR table

Then I will:

Help you interpret the ORs clearly

Write Section 5 in journal-ready language

Help you determine whether your immigrant paradox hypothesis is supported

This is where your paper becomes analytical rather than descriptive.

Part 3: Results Tables

Converting the crude and adjusted models into clean OR tables with 95% CIs and p-values

# --- Crude model table ---
tbl_crude <- tbl_regression(
  model_crude,
  exponentiate = TRUE,
  label = list(
    household_language ~ "Household language"
  )
) %>%
  modify_header(estimate = "**OR**") %>%
  add_significance_stars(
    hide_ci = FALSE,
    hide_p  = FALSE
  )

# --- Adjusted model table ---
tbl_adj <- tbl_regression(
  model_adj,
  exponentiate = TRUE,
  label = list(
    household_language  ~ "Household language",
    child_age           ~ "Child age (years)",
    child_sex           ~ "Child sex",
    child_race          ~ "Child race",
    income_fpl          ~ "Household income (% FPL)",
    parent_education    ~ "Primary caregiver education",
    family_structure    ~ "Family structure",
    has_insurance       ~ "Health insurance status",
    child_nativity      ~ "Child nativity",
    urban_rural         ~ "Urbanicity",
    ace_burden_cat      ~ "ACE burden",
    survey_year         ~ "Survey year"
  )
) %>%
  modify_header(estimate = "**aOR**") %>%
  add_significance_stars(
    hide_ci = FALSE,
    hide_p  = FALSE
  )

# --- Merge crude and adjusted side by side ---
tbl_merged <- tbl_merge(
  tbls        = list(tbl_crude, tbl_adj),
  tab_spanner = c("**Crude**", "**Adjusted**"),
  quiet       = TRUE
) %>%
  modify_caption(
    "**Table 2. Crude and Adjusted Odds Ratios for Any Impairing Mental/Behavioral Condition**"
  )

# --- Render and save ---
tbl_merged %>%
  as_gt() %>%
  gt::tab_source_note(
    source_note = gt::md(
      "OR = odds ratio; aOR = adjusted odds ratio; CI = 95% confidence interval. Models estimated using survey-weighted quasibinomial regression accounting for NSCH complex sampling design.  
      Reference categories: household language = English only; child sex = Male; child race = White alone; income = 0–99% FPL; parent education = Less than HS; family structure = Two parents; insurance = Insured; nativity = US-born; urbanicity = Urban; ACE burden = 0 ACEs; survey year = 2020.  
      *p<0.05; **p<0.01; ***p<0.001"
    )
  ) %>%
  gt::tab_options(
    table.font.size         = 12,
    heading.title.font.size = 14,
    data_row.padding        = gt::px(4)
  ) %>%
  gt::gtsave(file = file.path(project_folder, "outputs/tables/table2_crude_adjusted.html"))

tbl_merged

**Table 2. Crude and Adjusted Odds Ratios for Any Impairing Mental/Behavioral Condition**
Characteristic	Crude				Adjusted
Characteristic	OR¹	SE	95% CI	p-value	aOR¹	SE	95% CI	p-value
Household language
English only	—	—	—		—	—	—
Bilingual	0.37***	0.088	0.31, 0.44	<0.001	0.38***	0.106	0.31, 0.47	<0.001
Non-English	0.63**	0.159	0.46, 0.86	0.003	0.96	0.182	0.68, 1.38	0.8
Child age (years)					1.09***	0.004	1.09, 1.10	<0.001
Child sex
Male					—	—	—
Female					0.70***	0.032	0.66, 0.75	<0.001
Child race
White alone					—	—	—
Black or African American alone					0.74***	0.054	0.67, 0.83	<0.001
American Indian or Alaska Native alone					0.71**	0.124	0.56, 0.90	0.006
Asian alone					0.46***	0.092	0.39, 0.55	<0.001
Native Hawaiian / Other Pacific Islander alone					0.64	0.264	0.38, 1.08	0.094
Two or more races					1.00	0.057	0.90, 1.12	>0.9
Household income (% FPL)
0-99% FPL					—	—	—
100-199% FPL					0.93	0.058	0.83, 1.04	0.2
200-399% FPL					0.81***	0.055	0.72, 0.90	<0.001
400%+ FPL					0.81***	0.057	0.72, 0.90	<0.001
Primary caregiver education
Less than HS					—	—	—
High school					1.06	0.093	0.89, 1.27	0.5
Some college					1.16	0.092	0.97, 1.39	0.10
Bachelor's+					1.15	0.092	0.96, 1.38	0.13
Family structure
Two parents					—	—	—
Single parent					0.89**	0.045	0.81, 0.97	0.009
Other					1.17*	0.077	1.01, 1.37	0.039
Health insurance status
Insured					—	—	—
Uninsured					0.67***	0.087	0.56, 0.79	<0.001
Child nativity
US-born					—	—	—
Foreign-born					0.84	0.108	0.68, 1.03	0.10
Urbanicity
Urban					—	—	—
Rural					0.90**	0.042	0.82, 0.97	0.010
ACE burden
0 ACEs					—	—	—
1 ACE					1.73***	0.044	1.59, 1.88	<0.001
2 ACEs					2.49***	0.057	2.23, 2.79	<0.001
3+ ACEs					4.66***	0.060	4.14, 5.24	<0.001
Survey year
2020					—	—	—
2021					1.06	0.042	0.98, 1.15	0.2
2022					1.16***	0.040	1.07, 1.25	<0.001
¹ p<0.05; p<0.01; **p<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error

Primary Exposure: Household Language The crude-to-adjusted comparison tells an important story:

Bilingual vs. English-only: OR barely changes from crude (0.37) to adjusted (0.38), the protective effect is robust and not explained by any of the covariates. Bilingual households have 62% lower odds of the child having an impairing condition. Non-English vs. English-only: OR shifts dramatically from 0.63 (crude, significant) to 0.96 (adjusted, p = 0.80), the apparent crude protection completely disappears after adjustment. This tells you the crude association was entirely confounded, most likely by the younger age profile, lower income, and lower ACE burden of Non-English households.

ACE Burden — Strongest Effect in the Model The dose-response gradient is striking and all highly significant:

This is the largest effect in the entire model, larger even than the language effect, and follows a textbook dose-response patter

Child Characteristics

Age: Each additional year increases odds by 9% (aOR = 1.09), older children are more likely to have a diagnosed condition, reflecting cumulative detection over time Sex: Girls have 30% lower odds than boys (aOR = 0.70), consistent with the well-established male predominance in neurodevelopmental and externalizing conditions Race: Black (aOR = 0.74), AIAN (aOR = 0.71), and Asian (aOR = 0.46) children all have significantly lower odds than White children, likely reflecting diagnostic and access disparities rather than true lower prevalence

Socioeconomic Factors

Income: Only the higher FPL categories (200–399% and 400%+) are significant, both with aOR = 0.81; moderate and high income confers modest protection vs. poverty Parent education: No significant effects after adjustment, education’s influence is likely mediated through income and ACE burden Insurance: Uninsured children have 33% lower odds (aOR = 0.67), almost certainly reflects underdiagnosis rather than true protection Family structure: Single parent households have slightly lower odds (aOR = 0.89), counterintuitive, possibly reflecting similar underdiagnosis dynamics Rural: Lower odds than urban (aOR = 0.90), again likely a detection/access artifact

Survey Year

2022 shows modestly higher odds than 2020 (aOR = 1.16), could reflect true increases in child mental health conditions post-pandemic, or improved detection/reporting over time

After full adjustment, bilingual household language remained strongly associated with lower odds of impairing mental/behavioral conditions (aOR = 0.38, 95% CI: 0.31–0.47), while the crude association for Non-English households was entirely attenuated (aOR = 0.96, 95% CI: 0.68–1.38), suggesting confounding by demographic factors. ACE burden demonstrated the strongest independent association in the model, with children in households reporting 3 or more ACEs facing over four times the odds of impairing conditions compared to those with no ACEs (aOR = 4.66, 95% CI: 4.14–5.24).

Household Language and Mental/Behavioral Outcomes in Children

Marina Frimpong

2026 February 20