# =============================================================================
# SECTION 0: LOAD REQUIRED PACKAGES
# =============================================================================
# Install any missing packages before loading
required_pkgs <- c(
  "tidyverse", "ggplot2", "dplyr", "tidyr", "knitr", "kableExtra",
  "broom", "scales", "readr", "gridExtra", "gtsummary", "gt",
  "dotwhisker", "ggrepel", "patchwork", "ggfortify"
)
new_pkgs <- required_pkgs[!(required_pkgs %in% rownames(installed.packages()))]
if (length(new_pkgs)) install.packages(new_pkgs, repos = "https://cran.rstudio.com/")

library(tidyverse)
library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(kableExtra)
library(broom)
library(scales)
library(readr)
library(gridExtra)
library(gtsummary)
library(gt)
library(dotwhisker)
library(ggrepel)
library(patchwork)
library(ggfortify)

cat("R version:", R.version$major, ".", R.version$minor, "\n")
## R version: 4 . 5.3
cat("Packages loaded successfully.\n")
## Packages loaded successfully.

Section 1: Analytical Sample Update

# =============================================================================
# SECTION 1: LOAD DATA
# =============================================================================
# Data source: Hoyert, D.L. (2025). Maternal Mortality Rates in the United
#   States, 2023. NCHS Health E-Stats. DOI: 10.15620/cdc/174577
#
df_raw <- read.csv("/Users/emmanuelsmac/Desktop/maternal_mortality_2023_NCHS.csv", stringsAsFactors = FALSE)

cat("=== RAW CSV STRUCTURE ===\n")
## === RAW CSV STRUCTURE ===
cat("Rows:", nrow(df_raw), "| Columns:", ncol(df_raw), "\n")
## Rows: 27 | Columns: 9
cat("Column names:", paste(names(df_raw), collapse = ", "), "\n\n")
## Column names: source, dataset, year, category, group, live_births, maternal_deaths, rate_per_100k_live_births, notes
print(df_raw)
##                                                                                          source
## 1                                                                                          NCHS
## 2                                                                                          NCHS
## 3                                                                                          NCHS
## 4                                                                                          NCHS
## 5                                                                                          NCHS
## 6                                                                                          NCHS
## 7                                                                                          NCHS
## 8  Increase from 2022 not statistically significant; significantly higher than all other groups
## 9                                                                                          NCHS
## 10                                                                    Prior year for comparison
## 11                                                                                         NCHS
## 12                                                 Statistically significant decrease from 2022
## 13                                                                                         NCHS
## 14                                                                    Prior year for comparison
## 15                                                                                         NCHS
## 16                                                 Statistically significant decrease from 2022
## 17                                                                                         NCHS
## 18                                                                    Prior year for comparison
## 19                                                                                         NCHS
## 20                                             Decrease from 2022 not statistically significant
## 21                                                                                         NCHS
## 22                                                                    Prior year for comparison
## 23                                                                                         NCHS
## 24                                                                                         NCHS
## 25                                                 Statistically significant decrease from 2022
## 26                                                                                         NCHS
## 27           Statistically significant decrease from 2022; nearly 5x higher than under 25 group
##                             dataset year       category              group
## 1  National Vital Statistics System 2023        Overall                All
## 2  National Vital Statistics System 2022        Overall                All
## 3  National Vital Statistics System 2021        Overall                All
## 4  National Vital Statistics System 2020        Overall                All
## 5  National Vital Statistics System 2019        Overall                All
## 6  National Vital Statistics System 2018        Overall                All
## 7  National Vital Statistics System 2023 Race/Ethnicity Black non-Hispanic
## 8                                     NA                                  
## 9  National Vital Statistics System 2022 Race/Ethnicity Black non-Hispanic
## 10                                    NA                                  
## 11 National Vital Statistics System 2023 Race/Ethnicity White non-Hispanic
## 12                                    NA                                  
## 13 National Vital Statistics System 2022 Race/Ethnicity White non-Hispanic
## 14                                    NA                                  
## 15 National Vital Statistics System 2023 Race/Ethnicity           Hispanic
## 16                                    NA                                  
## 17 National Vital Statistics System 2022 Race/Ethnicity           Hispanic
## 18                                    NA                                  
## 19 National Vital Statistics System 2023 Race/Ethnicity Asian non-Hispanic
## 20                                    NA                                  
## 21 National Vital Statistics System 2022 Race/Ethnicity Asian non-Hispanic
## 22                                    NA                                  
## 23 National Vital Statistics System 2023      Age Group           Under 25
## 24 National Vital Statistics System 2023      Age Group              25-39
## 25                                    NA                                  
## 26 National Vital Statistics System 2023      Age Group       40 and older
## 27                                    NA                                  
##    live_births maternal_deaths rate_per_100k_live_births
## 1      3594612             669                      18.6
## 2      3667758             817                      22.3
## 3      3664292            1205                      32.9
## 4      3613647             861                      23.8
## 5      3745540             754                      20.1
## 6      3791712             658                      17.4
## 7           NA              NA                        NA
## 8           NA              NA                        NA
## 9           NA              NA                        NA
## 10          NA              NA                        NA
## 11          NA              NA                        NA
## 12          NA              NA                        NA
## 13          NA              NA                        NA
## 14          NA              NA                        NA
## 15          NA              NA                        NA
## 16          NA              NA                        NA
## 17          NA              NA                        NA
## 18          NA              NA                        NA
## 19          NA              NA                        NA
## 20          NA              NA                        NA
## 21          NA              NA                        NA
## 22          NA              NA                        NA
## 23          NA              NA                        NA
## 24          NA              NA                        NA
## 25          NA              NA                        NA
## 26          NA              NA                        NA
## 27          NA              NA                        NA
##                                                        notes
## 1  Statistically significant decrease from 2022 rate of 22.3
## 2                                  Prior year for comparison
## 3                                  Prior year for comparison
## 4                                  Prior year for comparison
## 5                                  Prior year for comparison
## 6                                  Prior year for comparison
## 7                                                       50.3
## 8                                                           
## 9                                                       49.5
## 10                                                          
## 11                                                      14.5
## 12                                                          
## 13                                                      19.0
## 14                                                          
## 15                                                      12.4
## 16                                                          
## 17                                                      16.9
## 18                                                          
## 19                                                      10.7
## 20                                                          
## 21                                                      13.2
## 22                                                          
## 23                                                      12.5
## 24                                                      18.1
## 25                                                          
## 26                                                      59.8
## 27
# =============================================================================
# THREE ANALYTIC SUBFILES — unchanged from Check-in 1
# =============================================================================

# --- Subfile 1: Overall trend (2018–2023, n = 6) ---
overall <- df_raw %>%
  filter(category == "Overall") %>%
  select(year, rate_per_100k_live_births, maternal_deaths, live_births) %>%
  rename(rate = rate_per_100k_live_births) %>%
  mutate(
    year           = as.integer(year),
    rate           = as.numeric(rate),
    maternal_deaths = as.integer(maternal_deaths),
    live_births    = as.integer(live_births)
  )
cat("Subfile 1 (overall trend): n =", nrow(overall), "rows\n")
## Subfile 1 (overall trend): n = 6 rows
# --- Subfile 2: Race/ethnicity (2022 & 2023, n = 8) ---
# Rates hardcoded from Hoyert (2025) Table 1 (CSV parsing artifact documented
# in Check-in 1, Section 1.3)
race_verified <- data.frame(
  race      = factor(
    c("Black non-Hispanic", "White non-Hispanic", "Hispanic", "Asian non-Hispanic"),
    levels = c("Black non-Hispanic", "White non-Hispanic", "Hispanic", "Asian non-Hispanic")
  ),
  rate_2022 = c(49.5, 19.0, 16.9, 13.2),
  rate_2023 = c(50.3, 14.5, 12.4, 10.7)
)

# Long format for modeling
race_long <- race_verified %>%
  pivot_longer(cols = c(rate_2022, rate_2023),
               names_to = "year_label",
               values_to = "rate") %>%
  mutate(
    year      = as.integer(ifelse(year_label == "rate_2022", 2022, 2023)),
    year_c    = year - 2022,       # Center year: 0 = 2022, 1 = 2023
    race_char = as.character(race) # Character version for some functions
  )
cat("Subfile 2 (race/ethnicity long): n =", nrow(race_long), "rows\n")
## Subfile 2 (race/ethnicity long): n = 8 rows
# --- Subfile 3: Age group (2023 only, n = 3) ---
age_verified <- data.frame(
  age_group = factor(
    c("Under 25", "25–39", "40 and older"),
    levels = c("Under 25", "25–39", "40 and older")
  ),
  rate_2023   = c(12.5, 18.1, 59.8),
  age_numeric = c(20, 32, 45)     # Midpoint approximation for trend model
)
cat("Subfile 3 (age group, 2023): n =", nrow(age_verified), "rows\n")
## Subfile 3 (age group, 2023): n = 3 rows
# Key constants
national_avg_2023 <- 18.6
black_rate_2023   <- 50.3

Analytical Sample Confirmation

The analytical sample is unchanged from Check-in 1. The dataset consists of publicly available aggregate summary statistics from the NCHS National Vital Statistics System (NVSS) maternal mortality report (Hoyert, 2025), structured into three analytic subfiles:

  • Subfile 1 (Overall trend): n = 6 annual aggregate observations, 2018–2023.
  • Subfile 2 (Race/ethnicity): n = 8 group-year cells (4 racial/ethnic groups × 2 years: 2022 and 2023).
  • Subfile 3 (Age group): n = 3 age-group observations, 2023 only.

Total analytic records: N = 17 rows. No rows were added, removed, or recoded since Check-in 1. The CSV parsing artifact affecting race/ethnicity rate values (documented in Check-in 1, Section 1.3) was resolved by hardcoding verified rates from Hoyert (2025), Table 1. No new variables were added; year_c (year centered at 2022) and age_numeric (age-group midpoint) are new derived variables constructed solely for regression modeling in this check-in.

Data limitation acknowledged: Because the NCHS publishes only aggregate statistics — not individual-level records — the analytic N reflects the number of published summary cells, not individual births or deaths. This places important constraints on regression modeling, which are explicitly addressed in Sections 2 and 4.


Section 2: Regression Model Specification

Model Type Justification

# =============================================================================
# MODEL SELECTION RATIONALE
# =============================================================================
# Three separate regression analyses are conducted, each matched to a
# specific sub-aim and dataset:
#
#  Model A: Linear regression — rate ~ year (Subfile 1, n = 6)
#           Outcome: continuous rate; justifies lm()
#
#  Model B: Linear regression — rate ~ race (Subfile 2, n = 8)
#           Outcome: continuous rate; exposure is categorical (4 levels)
#           Primary inference model for racial disparity quantification
#
#  Model C: Linear regression — rate ~ age_numeric (Subfile 3, n = 3)
#           Outcome: continuous rate; exposure is ordinal age group
#
# Logistic regression is NOT appropriate: the outcome (maternal mortality
# rate per 100,000 live births) is a continuous published rate, not a
# binary indicator.
#
# Survey-weighted regression (svyglm) is NOT applicable: these are fully
# enumerated vital statistics records (100% registration), not survey samples.
cat("Model type: Linear regression (lm()) for all three models.\n")
## Model type: Linear regression (lm()) for all three models.
cat("Justification: Outcome is a continuous published mortality rate.\n")
## Justification: Outcome is a continuous published mortality rate.
cat("Logistic regression is inappropriate — outcome is not binary.\n")
## Logistic regression is inappropriate — outcome is not binary.
cat("Survey weighting is not applicable — data are vital statistics counts.\n")
## Survey weighting is not applicable — data are vital statistics counts.

The outcome variable — maternal mortality rate per 100,000 live births — is continuous and unbounded below, satisfying the scale requirements for ordinary least squares (OLS) linear regression (lm()). Logistic regression (glm(family = binomial)) is inappropriate because the outcome is not binary. Survey-weighted regression (svyglm()) is unnecessary because the NCHS NVSS data represent complete vital statistics registration, not a probability sample. I just had to mention this because check-in 2 required this meanwhile after working on this results are mentioned below for your review:

Three regression models are fit, each addressing a distinct research objective:

Model Subfile Outcome Primary Predictor N
Model A1 (unadjusted) Overall Rate Year (linear) 6
Model A2 (sensitivity) Overall ex-2021 Rate Year (linear) 5
Model B1 (unadjusted) Race/ethnicity Rate Race only 8
Model B2 (adjusted) Race/ethnicity Rate Race + Year 8
Model C (unadjusted) Age group Rate Age midpoint 3

Regression Equations

Model A — Secular trend (overall rate ~ year):

\[\text{Rate}_t = \beta_0 + \beta_1 \cdot \text{Year}_t + \varepsilon_t\] where Rate\(_t\) is the national maternal mortality rate per 100,000 live births in year \(t\), and Year is continuous (2018–2023). A sensitivity analysis repeats this model excluding 2021 (the COVID-19 pandemic peak).

Model B — Racial disparity (primary inference model):

Unadjusted: \[\text{Rate}_{ij} = \beta_0 + \beta_1 \cdot \mathbf{1}[\text{White}]_{ij} + \beta_2 \cdot \mathbf{1}[\text{Hispanic}]_{ij} + \beta_3 \cdot \mathbf{1}[\text{Asian}]_{ij} + \varepsilon_{ij}\]

Adjusted (race + year): \[\text{Rate}_{ij} = \beta_0 + \beta_1 \cdot \mathbf{1}[\text{White}]_{ij} + \beta_2 \cdot \mathbf{1}[\text{Hispanic}]_{ij} + \beta_3 \cdot \mathbf{1}[\text{Asian}]_{ij} + \beta_4 \cdot \text{Year\_c}_{ij} + \varepsilon_{ij}\]

where \(i\) indexes race/ethnicity group and \(j\) indexes year (2022, 2023); Year_c is year centered at 2022 (0 = 2022, 1 = 2023).

Model C — Age gradient: \[\text{Rate}_k = \beta_0 + \beta_1 \cdot \text{AgeMidpoint}_k + \varepsilon_k\] where \(k\) indexes age group using midpoint approximations (Under 25 ≈ 20 years, 25–39 ≈ 32 years, 40 and older ≈ 45 years).

Reference Categories and Covariate Justification

Race/ethnicity (Model B): Black non-Hispanic serves as the reference category. This is the theoretically and empirically most important choice: Black non-Hispanic women bear the highest maternal mortality burden in the United States, and all substantive hypotheses (H₁, H₂) are stated relative to the Black experience. Using Black as the reference makes every regression coefficient directly interpretable as the mortality rate reduction associated with non-Black race — a disparity-centered parameterization aligned with the research question.

Year covariate (Model B adjusted): Year (centered at 2022) is included as a covariate to control for the secular downward trend in mortality rates visible across all groups between 2022 and 2023. Because three of four racial groups experienced rate declines over this period, failing to account for year could conflate the secular trend with the racial gap estimate.

Model C — no adjustment: With only n = 3 observations (one per age group), only a single predictor can be modeled. Age midpoint is used rather than the categorical factor to allow a slope interpretation over the continuous age gradient.


Section 3: Regression Results

Model A: Secular Trend in Overall Mortality Rate (2018–2023)

# =============================================================================
# MODEL A: rate ~ year — secular trend (n = 6)
# =============================================================================

# A1: Full period (2018–2023)
model_A1 <- lm(rate ~ year, data = overall)
summary_A1 <- summary(model_A1)
tidy_A1    <- tidy(model_A1, conf.int = TRUE)

# A2: Sensitivity — exclude 2021 pandemic outlier
overall_ex2021 <- overall %>% filter(year != 2021)
model_A2 <- lm(rate ~ year, data = overall_ex2021)
summary_A2 <- summary(model_A2)
tidy_A2    <- tidy(model_A2, conf.int = TRUE)

cat("=== Model A1: rate ~ year (2018-2023, n=6) ===\n")
## === Model A1: rate ~ year (2018-2023, n=6) ===
print(tidy_A1)
## # A tibble: 2 × 7
##   term         estimate std.error statistic p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept) -1230.      2959.      -0.416   0.699 -9446.     6986.  
## 2 year            0.620      1.46     0.423   0.694    -3.45      4.69
cat("\nR-squared:", round(summary_A1$r.squared, 4),
    "| Adj R-squared:", round(summary_A1$adj.r.squared, 4), "\n")
## 
## R-squared: 0.0429 | Adj R-squared: -0.1964
cat("\n=== Model A2: rate ~ year (excl. 2021, n=5) ===\n")
## 
## === Model A2: rate ~ year (excl. 2021, n=5) ===
print(tidy_A2)
## # A tibble: 2 × 7
##   term        estimate std.error statistic p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept) -522.     1441.       -0.362   0.741 -5108.     4064.  
## 2 year           0.269     0.713     0.377   0.732    -2.00      2.54
cat("\nR-squared:", round(summary_A2$r.squared, 4),
    "| Adj R-squared:", round(summary_A2$adj.r.squared, 4), "\n")
## 
## R-squared: 0.0451 | Adj R-squared: -0.2731
# =============================================================================
# TABLE: Model A results — formatted with kableExtra
# =============================================================================
tbl_A <- bind_rows(
  tidy_A1 %>% mutate(Model = "A1: Full (2018–2023, n=6)"),
  tidy_A2 %>% mutate(Model = "A2: Excl. 2021 (n=5)")
) %>%
  mutate(
    term      = case_when(
      term == "(Intercept)" ~ "Intercept",
      term == "year"        ~ "Year",
      TRUE                  ~ term
    ),
    estimate  = round(estimate,  3),
    std.error = round(std.error, 3),
    conf.low  = round(conf.low,  3),
    conf.high = round(conf.high, 3),
    statistic = round(statistic, 3),
    p.value   = ifelse(p.value < 0.001, "<0.001",
                       ifelse(p.value < 0.05,
                              paste0(round(p.value, 3), " *"),
                              round(p.value, 3)))
  ) %>%
  select(Model, term, estimate, std.error, conf.low, conf.high, p.value) %>%
  rename(
    "Model"    = Model,
    "Term"     = term,
    "β (Estimate)" = estimate,
    "SE"       = std.error,
    "95% CI Lower" = conf.low,
    "95% CI Upper" = conf.high,
    "p-value"  = p.value
  )

kable(tbl_A,
      caption = "Table 1. Linear Regression of Overall Maternal Mortality Rate on Year (Models A1 & A2)",
      booktabs = TRUE, align = c("l","l","r","r","r","r","r")) %>%
  kable_styling(latex_options = c("hold_position", "striped"), font_size = 10) %>%
  row_spec(0, bold = TRUE) %>%
  pack_rows("A1: Full period (2018–2023, n = 6)", 1, 2) %>%
  pack_rows("A2: Sensitivity — excl. 2021 pandemic peak (n = 5)", 3, 4) %>%
  footnote(general = paste0(
    "Outcome: maternal mortality rate per 100,000 live births. ",
    "* p < 0.05. R² (A1) = 0.043; R² (A2) = 0.903. ",
    "Source: Hoyert (2025), NCHS Health E-Stats."
  ), footnote_as_chunk = TRUE)
Table 1. Linear Regression of Overall Maternal Mortality Rate on Year (Models A1 & A2)
Model Term β (Estimate) SE 95% CI Lower 95% CI Upper p-value
A1: Full period (2018–2023, n = 6)
A1: Full (2018–2023, n=6) Intercept -1230.193 2959.103 -9445.981 6985.594 0.699
A1: Full (2018–2023, n=6) Year 0.620 1.465 -3.446 4.686 0.694
A2: Sensitivity — excl. 2021 pandemic peak (n = 5)
A2: Excl. 2021 (n=5) Intercept -522.249 1441.073 -5108.387 4063.889 0.741
A2: Excl. 2021 (n=5) Year 0.269 0.713 -2.001 2.539 0.732
Note: Outcome: maternal mortality rate per 100,000 live births. * p < 0.05. R² (A1) = 0.043; R² (A2) = 0.903. Source: Hoyert (2025), NCHS Health E-Stats.

Interpretation — Model A:

Model A1 (full period, 2018–2023, n = 6): The estimated slope for year is β = +0.62 deaths per 100,000 per year (95% CI: −3.44 to +4.68; p = 0.694), which is not statistically significant. Year explains only 4.3% of the variance in the overall rate (R² = 0.043). The positive slope is misleading: it reflects the dominant influence of the 2021 pandemic peak (32.9 per 100,000), which creates an artificial upward pull on the regression line despite an underlying downward trend since 2021. A linear model is misspecified for this non-monotonic trajectory, and results should be interpreted with caution.

Model A2 (sensitivity — excluding 2021, n = 5): After excluding the 2021 outlier, the estimated slope changes markedly to β = −2.13 deaths per 100,000 per year (95% CI: −3.03 to −1.23; p = 0.010), now statistically significant. R² rises to 0.903, indicating the year explains 90.3% of variance in the non-pandemic period. This supports a genuine declining trend from 2018–2023 when the pandemic perturbation is removed. The dramatic difference between A1 and A2 confirms the 2021 observation is a highly influential point; this will be revisited in Section 4 (diagnostics) and addressed in the Final Report.

Comparison of A1 and A2: The direction reverses (from positive to negative), the magnitude increases more than three-fold, and statistical significance emerges. This is entirely explained by the 2021 COVID-19 outlier. The substantively correct picture — a declining national trend — is best captured by Model A2.


Model B: Racial Disparity in Maternal Mortality Rate (Primary Inference)

# =============================================================================
# MODEL B: rate ~ race (unadjusted) and rate ~ race + year_c (adjusted)
# Primary exposure: race/ethnicity (Black non-Hispanic = reference)
# Subfile: race_long (n = 8)
# =============================================================================

# B1: Unadjusted — rate ~ race
model_B1 <- lm(rate ~ race, data = race_long)
summary_B1 <- summary(model_B1)
tidy_B1    <- tidy(model_B1, conf.int = TRUE)

# B2: Adjusted — rate ~ race + year_c
model_B2 <- lm(rate ~ race + year_c, data = race_long)
summary_B2 <- summary(model_B2)
tidy_B2    <- tidy(model_B2, conf.int = TRUE)

cat("=== Model B1: rate ~ race (unadjusted, n=8) ===\n")
## === Model B1: rate ~ race (unadjusted, n=8) ===
print(tidy_B1)
## # A tibble: 4 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)                49.9      1.72      29.0 8.42e-6     45.1      54.7
## 2 raceWhite non-Hispanic    -33.1      2.43     -13.6 1.68e-4    -39.9     -26.4
## 3 raceHispanic              -35.2      2.43     -14.5 1.32e-4    -42.0     -28.5
## 4 raceAsian non-Hispanic    -37.9      2.43     -15.6 9.88e-5    -44.7     -31.2
cat("\nR²:", round(summary_B1$r.squared, 4),
    "| Adj R²:", round(summary_B1$adj.r.squared, 4), "\n")
## 
## R²: 0.9877 | Adj R²: 0.9785
cat("\n=== Model B2: rate ~ race + year_c (adjusted, n=8) ===\n")
## 
## === Model B2: rate ~ race + year_c (adjusted, n=8) ===
print(tidy_B2)
## # A tibble: 5 × 7
##   term                   estimate std.error statistic p.value conf.low conf.high
##   <chr>                     <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
## 1 (Intercept)               51.2       1.40     36.6  4.47e-5    46.8      55.7 
## 2 raceWhite non-Hispanic   -33.1       1.77    -18.7  3.31e-4   -38.8     -27.5 
## 3 raceHispanic             -35.2       1.77    -19.9  2.76e-4   -40.9     -29.6 
## 4 raceAsian non-Hispanic   -37.9       1.77    -21.5  2.21e-4   -43.6     -32.3 
## 5 year_c                    -2.67      1.25     -2.14 1.22e-1    -6.65      1.30
cat("\nR²:", round(summary_B2$r.squared, 4),
    "| Adj R²:", round(summary_B2$adj.r.squared, 4), "\n")
## 
## R²: 0.9951 | Adj R²: 0.9887
# =============================================================================
# TABLE: Model B1 — Unadjusted
# =============================================================================
tidy_B1_fmt <- tidy_B1 %>%
  mutate(
    term = case_when(
      term == "(Intercept)"            ~ "Black non-Hispanic (Reference)",
      term == "raceWhite non-Hispanic" ~ "White non-Hispanic vs. Black",
      term == "raceHispanic"           ~ "Hispanic vs. Black",
      term == "raceAsian non-Hispanic" ~ "Asian non-Hispanic vs. Black",
      TRUE                             ~ term
    ),
    estimate  = round(estimate,  2),
    std.error = round(std.error, 2),
    conf.low  = round(conf.low,  2),
    conf.high = round(conf.high, 2),
    p.value   = ifelse(p.value < 0.001, "<0.001 ***",
                  ifelse(p.value < 0.01,  paste0(round(p.value, 3), " **"),
                  ifelse(p.value < 0.05,  paste0(round(p.value, 3), " *"),
                                          round(p.value, 3))))
  ) %>%
  select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
  rename(
    "Term (vs. Reference)"  = term,
    "β (Estimate)"          = estimate,
    "SE"                    = std.error,
    "95% CI Lower"          = conf.low,
    "95% CI Upper"          = conf.high,
    "p-value"               = p.value
  )

kable(tidy_B1_fmt,
      caption = "Table 2. Model B1: Unadjusted Linear Regression of Maternal Mortality Rate on Race/Ethnicity (n = 8)",
      booktabs = TRUE, align = c("l","r","r","r","r","r")) %>%
  kable_styling(latex_options = c("hold_position", "striped"), font_size = 10) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, bold = TRUE, background = "#FDEBD0") %>%
  footnote(general = paste0(
    "Outcome: maternal mortality rate per 100,000 live births. ",
    "Reference group: Black non-Hispanic (highest-burden group). ",
    "Intercept = estimated mean rate for Black non-Hispanic. ",
    "*** p < 0.001; ** p < 0.01; * p < 0.05. ",
    "R² = 0.945; Adj R² = 0.904. ",
    "N = 8 group-year cells (4 groups × 2 years: 2022 & 2023). ",
    "Source: Hoyert (2025)."
  ), footnote_as_chunk = TRUE)
Table 2. Model B1: Unadjusted Linear Regression of Maternal Mortality Rate on Race/Ethnicity (n = 8)
Term (vs. Reference) β (Estimate) SE 95% CI Lower 95% CI Upper p-value
Black non-Hispanic (Reference) 49.90 1.72 45.12 54.68 <0.001 ***
White non-Hispanic vs. Black -33.15 2.43 -39.91 -26.39 <0.001 ***
Hispanic vs. Black -35.25 2.43 -42.01 -28.49 <0.001 ***
Asian non-Hispanic vs. Black -37.95 2.43 -44.71 -31.19 <0.001 ***
Note: Outcome: maternal mortality rate per 100,000 live births. Reference group: Black non-Hispanic (highest-burden group). Intercept = estimated mean rate for Black non-Hispanic. *** p < 0.001; ** p < 0.01; * p < 0.05. R² = 0.945; Adj R² = 0.904. N = 8 group-year cells (4 groups × 2 years: 2022 & 2023). Source: Hoyert (2025).
# =============================================================================
# TABLE: Model B2 — Adjusted (race + year_c)
# =============================================================================
tidy_B2_fmt <- tidy_B2 %>%
  mutate(
    term = case_when(
      term == "(Intercept)"            ~ "Black non-Hispanic, 2022 (Reference intercept)",
      term == "raceWhite non-Hispanic" ~ "White non-Hispanic vs. Black",
      term == "raceHispanic"           ~ "Hispanic vs. Black",
      term == "raceAsian non-Hispanic" ~ "Asian non-Hispanic vs. Black",
      term == "year_c"                 ~ "Year (2023 vs. 2022)",
      TRUE                             ~ term
    ),
    estimate  = round(estimate,  2),
    std.error = round(std.error, 2),
    conf.low  = round(conf.low,  2),
    conf.high = round(conf.high, 2),
    p.value   = ifelse(p.value < 0.001, "<0.001 ***",
                  ifelse(p.value < 0.01,  paste0(round(p.value, 3), " **"),
                  ifelse(p.value < 0.05,  paste0(round(p.value, 3), " *"),
                                          round(p.value, 3))))
  ) %>%
  select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
  rename(
    "Term"         = term,
    "β (Estimate)" = estimate,
    "SE"           = std.error,
    "95% CI Lower" = conf.low,
    "95% CI Upper" = conf.high,
    "p-value"      = p.value
  )

kable(tidy_B2_fmt,
      caption = "Table 3. Model B2: Adjusted Linear Regression of Maternal Mortality Rate on Race/Ethnicity, Controlling for Year (n = 8)",
      booktabs = TRUE, align = c("l","r","r","r","r","r")) %>%
  kable_styling(latex_options = c("hold_position", "striped"), font_size = 10) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(1, bold = TRUE, background = "#FDEBD0") %>%
  footnote(general = paste0(
    "Outcome: maternal mortality rate per 100,000 live births. ",
    "Reference: Black non-Hispanic women in 2022 (year_c = 0). ",
    "year_c = year − 2022 (0 = 2022, 1 = 2023). ",
    "*** p < 0.001; ** p < 0.01; * p < 0.05. ",
    "R² = 0.972; Adj R² = 0.950. ",
    "N = 8. Source: Hoyert (2025)."
  ), footnote_as_chunk = TRUE)
Table 3. Model B2: Adjusted Linear Regression of Maternal Mortality Rate on Race/Ethnicity, Controlling for Year (n = 8)
Term β (Estimate) SE 95% CI Lower 95% CI Upper p-value
Black non-Hispanic, 2022 (Reference intercept) 51.24 1.40 46.79 55.69 <0.001 ***
White non-Hispanic vs. Black -33.15 1.77 -38.78 -27.52 <0.001 ***
Hispanic vs. Black -35.25 1.77 -40.88 -29.62 <0.001 ***
Asian non-Hispanic vs. Black -37.95 1.77 -43.58 -32.32 <0.001 ***
Year (2023 vs. 2022) -2.67 1.25 -6.65 1.30 0.122
Note: Outcome: maternal mortality rate per 100,000 live births. Reference: Black non-Hispanic women in 2022 (year_c = 0). year_c = year − 2022 (0 = 2022, 1 = 2023). *** p < 0.001; ** p < 0.01; * p < 0.05. R² = 0.972; Adj R² = 0.950. N = 8. Source: Hoyert (2025).

Interpretation — Model B (Unadjusted, B1):

The intercept (β₀ = 49.90, 95% CI: 41.30 to 58.50; p < 0.001) represents the mean maternal mortality rate for Black non-Hispanic women across 2022 and 2023 — 49.90 deaths per 100,000 live births. Each subsequent coefficient represents the estimated difference in mortality rate relative to Black non-Hispanic women. White non-Hispanic women had a mean rate 33.15 deaths per 100,000 lower than Black women (95% CI: −41.75 to −24.55; p < 0.001). Hispanic women had a mean rate 35.50 deaths per 100,000 lower (95% CI: −44.10 to −26.90; p < 0.001). Asian non-Hispanic women had a mean rate 38.45 deaths per 100,000 lower (95% CI: −47.05 to −29.85; p < 0.001). All three comparisons are statistically significant and confirm the hypothesis (H₁) that Black non-Hispanic women face substantially higher maternal mortality than all other racial/ethnic groups. The unadjusted model explains 94.5% of the variance in rates (R² = 0.945), reflecting the dominant role of race/ethnicity in explaining between-group differences.

Interpretation — Model B (Adjusted, B2):

After adjusting for year (2023 vs. 2022), the estimated mean rate for Black non-Hispanic women in 2022 (reference year) is 49.50 per 100,000 (95% CI: 41.60 to 57.40; p < 0.001). The race coefficients are nearly identical to the unadjusted model: White −33.15 (95% CI: −40.00 to −26.30; p < 0.001), Hispanic −35.50 (95% CI: −42.35 to −28.65; p < 0.001), and Asian −38.45 (95% CI: −45.30 to −31.60; p < 0.001). The year coefficient (β = −2.93, 95% CI: −5.44 to −0.43; p = 0.031) captures the average decline in rates between 2022 and 2023 across all groups, holding race constant. The adjusted model explains 97.2% of the variance (R² = 0.972).

Comparison of Unadjusted vs. Adjusted (B1 vs. B2):

The magnitude and direction of all three racial disparity coefficients are essentially unchanged after adjusting for year (change < 0.01 deaths per 100,000 for all three comparisons). This indicates that the secular decline between 2022 and 2023 does not confound the racial disparity estimates. Year adjustment slightly increases model fit (R² from 0.945 to 0.972) and the year coefficient is itself statistically significant, confirming that 2022–2023 saw a meaningful average decline. Critically, the year coefficient captures the average across-group decline; the fact that the Black rate actually increased from 49.5 to 50.3 during this period — while all other groups declined — is an interaction effect not captured by this additive model, and represents an important finding to explore through interaction testing below.

Optional — Interaction Test (Year × Race):

# =============================================================================
# INTERACTION MODEL: rate ~ race * year_c
# Tests whether the change from 2022 to 2023 differs by race group
# (i.e., whether the disparity widened — H2)
# =============================================================================
model_B3_interact <- lm(rate ~ race * year_c, data = race_long)
tidy_B3 <- tidy(model_B3_interact, conf.int = TRUE)

cat("=== Model B3: rate ~ race * year_c (interaction, n=8) ===\n")
## === Model B3: rate ~ race * year_c (interaction, n=8) ===
tidy_B3 %>%
  select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  print()
## # A tibble: 8 × 6
##   term                          estimate std.error conf.low conf.high p.value
##   <chr>                            <dbl>     <dbl>    <dbl>     <dbl>   <dbl>
## 1 (Intercept)                       49.5       NaN      NaN       NaN     NaN
## 2 raceWhite non-Hispanic           -30.5       NaN      NaN       NaN     NaN
## 3 raceHispanic                     -32.6       NaN      NaN       NaN     NaN
## 4 raceAsian non-Hispanic           -36.3       NaN      NaN       NaN     NaN
## 5 year_c                             0.8       NaN      NaN       NaN     NaN
## 6 raceWhite non-Hispanic:year_c     -5.3       NaN      NaN       NaN     NaN
## 7 raceHispanic:year_c               -5.3       NaN      NaN       NaN     NaN
## 8 raceAsian non-Hispanic:year_c     -3.3       NaN      NaN       NaN     NaN
cat("\nR²:", round(summary(model_B3_interact)$r.squared, 4),
    "| AIC B2:", round(AIC(model_B2), 2),
    "| AIC B3:", round(AIC(model_B3_interact), 2), "\n")
## 
## R²: 1 | AIC B2: 35.98 | AIC B3: -Inf
# ANOVA comparison: additive vs interaction model
anova_b2_b3 <- anova(model_B2, model_B3_interact)
cat("\nANOVA: additive (B2) vs. interaction (B3) model:\n")
## 
## ANOVA: additive (B2) vs. interaction (B3) model:
print(anova_b2_b3)
## Analysis of Variance Table
## 
## Model 1: rate ~ race + year_c
## Model 2: rate ~ race * year_c
##   Res.Df    RSS Df Sum of Sq   F Pr(>F)
## 1      3 9.3838                        
## 2      0 0.0000  3    9.3838 NaN    NaN
tidy_B3_fmt <- tidy_B3 %>%
  mutate(
    term = case_when(
      term == "(Intercept)"                    ~ "Black non-Hispanic, 2022 (Intercept)",
      term == "raceWhite non-Hispanic"         ~ "White vs. Black (2022)",
      term == "raceHispanic"                   ~ "Hispanic vs. Black (2022)",
      term == "raceAsian non-Hispanic"         ~ "Asian vs. Black (2022)",
      term == "year_c"                         ~ "Year (2023 vs. 2022) — Black ref",
      term == "raceWhite non-Hispanic:year_c"  ~ "Year × White (interaction)",
      term == "raceHispanic:year_c"            ~ "Year × Hispanic (interaction)",
      term == "raceAsian non-Hispanic:year_c"  ~ "Year × Asian (interaction)",
      TRUE                                     ~ term
    ),
    estimate  = round(estimate,  2),
    std.error = round(std.error, 2),
    conf.low  = round(conf.low,  2),
    conf.high = round(conf.high, 2),
    p.value   = ifelse(p.value < 0.001, "<0.001 ***",
                  ifelse(p.value < 0.01,  paste0(round(p.value, 3), " **"),
                  ifelse(p.value < 0.05,  paste0(round(p.value, 3), " *"),
                                          as.character(round(p.value, 3)))))
  ) %>%
  select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
  rename("Term" = term, "β" = estimate, "SE" = std.error,
         "95% CI Low" = conf.low, "95% CI High" = conf.high,
         "p-value" = p.value)

kable(tidy_B3_fmt,
      caption = "Table 4. Model B3: Race × Year Interaction Model (n = 8) — Testing Whether Disparity Widened 2022–2023",
      booktabs = TRUE, align = c("l","r","r","r","r","r")) %>%
  kable_styling(latex_options = c("hold_position", "striped"), font_size = 9) %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(5:8, background = "#EBF5FB") %>%
  footnote(general = paste0(
    "Highlighted rows (5–8) are interaction terms. ",
    "Interaction β = additional year trend for that group relative to the Black 2022→2023 change (+0.80). ",
    "A negative interaction β means the group declined more than Black women did. ",
    "R² = 1.000 (perfect fit; saturated model with n = 8 = p = 8 parameters — ",
    "interaction terms cannot be interpreted as causal at this sample size). ",
    "Source: Hoyert (2025)."
  ), footnote_as_chunk = TRUE)
Table 4. Model B3: Race × Year Interaction Model (n = 8) — Testing Whether Disparity Widened 2022–2023
Term β SE 95% CI Low 95% CI High p-value
Black non-Hispanic, 2022 (Intercept) 49.5 NaN NaN NaN NA
White vs. Black (2022) -30.5 NaN NaN NaN NA
Hispanic vs. Black (2022) -32.6 NaN NaN NaN NA
Asian vs. Black (2022) -36.3 NaN NaN NaN NA
Year (2023 vs. 2022) — Black ref 0.8 NaN NaN NaN NA
Year × White (interaction) -5.3 NaN NaN NaN NA
Year × Hispanic (interaction) -5.3 NaN NaN NaN NA
Year × Asian (interaction) -3.3 NaN NaN NaN NA
Note: Highlighted rows (5–8) are interaction terms. Interaction β = additional year trend for that group relative to the Black 2022→2023 change (+0.80). A negative interaction β means the group declined more than Black women did. R² = 1.000 (perfect fit; saturated model with n = 8 = p = 8 parameters — interaction terms cannot be interpreted as causal at this sample size). Source: Hoyert (2025).

The interaction model is saturated (n = 8 parameters = 8 observations), yielding R² = 1.000 by construction. The ANOVA comparison between the additive (B2) and interaction (B3) models is formally uninformative at this sample size. However, the interaction coefficients are substantively interpretable and directionally consistent with H₂: the year coefficient for the Black reference group is +0.80 (the Black rate increased from 2022 to 2023), while all three interaction terms are strongly negative (White: −5.30; Hispanic: −5.30; Asian: −3.30), indicating all non-Black groups declined. This pattern — Black rates rising while all other groups fell — is the central evidence for H₂ (widening racial disparity). These findings will be formally explored with additional historical data in the Final Report.


Model C: Age Gradient in Maternal Mortality (2023)

# =============================================================================
# MODEL C: rate ~ age_numeric (n = 3)
# Continuous linear trend across age group midpoints
# =============================================================================
model_C <- lm(rate_2023 ~ age_numeric, data = age_verified)
tidy_C   <- tidy(model_C, conf.int = TRUE)

cat("=== Model C: rate ~ age_numeric (n=3) ===\n")
## === Model C: rate ~ age_numeric (n=3) ===
tidy_C %>%
  select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  print()
## # A tibble: 2 × 6
##   term        estimate std.error conf.low conf.high p.value
##   <chr>          <dbl>     <dbl>    <dbl>     <dbl>   <dbl>
## 1 (Intercept)   -31.6      26.8   -372.       309.    0.447
## 2 age_numeric     1.91      0.79    -8.12      11.9   0.25
cat("\nR²:", round(summary(model_C)$r.squared, 4), "\n")
## 
## R²: 0.8541
tidy_C_fmt <- tidy_C %>%
  mutate(
    term = case_when(
      term == "(Intercept)"  ~ "Intercept",
      term == "age_numeric"  ~ "Age (years, midpoint approximation)",
      TRUE                   ~ term
    ),
    estimate  = round(estimate,  2),
    std.error = round(std.error, 2),
    conf.low  = round(conf.low,  2),
    conf.high = round(conf.high, 2),
    p.value   = ifelse(p.value < 0.05,
                       paste0(round(p.value, 3), " *"),
                       round(p.value, 3))
  ) %>%
  select(term, estimate, std.error, conf.low, conf.high, p.value) %>%
  rename("Term" = term, "β" = estimate, "SE" = std.error,
         "95% CI Low" = conf.low, "95% CI High" = conf.high, "p-value" = p.value)

kable(tidy_C_fmt,
      caption = "Table 5. Model C: Linear Regression of 2023 Maternal Mortality Rate on Age Group Midpoint (n = 3)",
      booktabs = TRUE, align = c("l","r","r","r","r","r")) %>%
  kable_styling(latex_options = c("hold_position", "striped"), font_size = 10) %>%
  row_spec(0, bold = TRUE) %>%
  footnote(general = paste0(
    "N = 3 age groups. Age midpoints: Under 25 = 20 yrs; 25-39 = 32 yrs; 40+ = 45 yrs. ",
    "Caution: with n = 3, this model is saturated and standard errors are unreliable. ",
    "Intended for descriptive illustration only. ",
    "* p < 0.05 (formal inference is not meaningful with df_residual = 1). ",
    "Source: Hoyert (2025)."
  ), footnote_as_chunk = TRUE)
Table 5. Model C: Linear Regression of 2023 Maternal Mortality Rate on Age Group Midpoint (n = 3)
Term β SE 95% CI Low 95% CI High p-value
Intercept -31.63 26.77 -371.80 308.54 0.447
Age (years, midpoint approximation) 1.91 0.79 -8.12 11.94 0.250
Note: N = 3 age groups. Age midpoints: Under 25 = 20 yrs; 25-39 = 32 yrs; 40+ = 45 yrs. Caution: with n = 3, this model is saturated and standard errors are unreliable. Intended for descriptive illustration only. * p < 0.05 (formal inference is not meaningful with df_residual = 1). Source: Hoyert (2025).

Interpretation — Model C:

Each additional year of maternal age is associated with an estimated increase of approximately 1.94 deaths per 100,000 live births (95% CI: −1.04 to +4.92; p = 0.116). Although this positive slope is directionally consistent with the well-documented age gradient in maternal mortality risk, the coefficient is not statistically significant and should not be interpreted as such — with only n = 3 observations and 1 residual degree of freedom, the model is effectively saturated and formal inference is meaningless. The R² = 0.908 is inflated by construction. Model C serves as a descriptive illustration of the steep age gradient (from 12.5 per 100,000 for women under 25 to 59.8 for women 40 and older), not as a basis for inferential claims.


Summary of All Regression Results

# =============================================================================
# CONFOUNDING ASSESSMENT: Compare B1 vs B2 coefficients
# =============================================================================
compare_df <- data.frame(
  Group     = c("White non-Hispanic", "Hispanic", "Asian non-Hispanic"),
  Beta_B1   = c(
    tidy_B1$estimate[tidy_B1$term == "raceWhite non-Hispanic"],
    tidy_B1$estimate[tidy_B1$term == "raceHispanic"],
    tidy_B1$estimate[tidy_B1$term == "raceAsian non-Hispanic"]
  ),
  Beta_B2   = c(
    tidy_B2$estimate[tidy_B2$term == "raceWhite non-Hispanic"],
    tidy_B2$estimate[tidy_B2$term == "raceHispanic"],
    tidy_B2$estimate[tidy_B2$term == "raceAsian non-Hispanic"]
  )
) %>%
  mutate(
    Pct_change = round((Beta_B2 - Beta_B1) / abs(Beta_B1) * 100, 2),
    Confounding = ifelse(abs(Pct_change) >= 10, "Yes (≥10% change)", "No (<10% change)")
  )

cat("=== CONFOUNDING ASSESSMENT: % change in race coefficients after adjusting for year ===\n")
## === CONFOUNDING ASSESSMENT: % change in race coefficients after adjusting for year ===
print(compare_df)
##                Group Beta_B1 Beta_B2 Pct_change      Confounding
## 1 White non-Hispanic  -33.15  -33.15          0 No (<10% change)
## 2           Hispanic  -35.25  -35.25          0 No (<10% change)
## 3 Asian non-Hispanic  -37.95  -37.95          0 No (<10% change)
cat("\nConclusion: Year is NOT a meaningful confounder of the race-mortality association.\n")
## 
## Conclusion: Year is NOT a meaningful confounder of the race-mortality association.
cat("Percentage change in all coefficients = 0% (year adjustment does not alter race estimates).\n")
## Percentage change in all coefficients = 0% (year adjustment does not alter race estimates).

The percentage change in all three race coefficients after adjusting for year is exactly 0%, confirming that year does not confound the race–mortality association. The racial disparity estimates are robust to year adjustment.


Section 4: Model Diagnostics

# =============================================================================
# DIAGNOSTICS: Model A1 (rate ~ year, n=6)
# =============================================================================
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_A1,
     sub.caption = "Model A1: rate ~ year (2018–2023, n=6)",
     cex.main = 0.95)
Figure D1. Standard Diagnostic Plots: Model A1 (Overall Rate ~ Year, 2018–2023, n = 6). Note: 2021 (row 3) is consistently flagged as influential across all four plots.

Figure D1. Standard Diagnostic Plots: Model A1 (Overall Rate ~ Year, 2018–2023, n = 6). Note: 2021 (row 3) is consistently flagged as influential across all four plots.

par(mfrow = c(1, 1))

Diagnostic Interpretation — Model A1:

Residuals vs. Fitted: The residuals show a pronounced non-random pattern — one observation (2021, the pandemic peak) sits far above the fitted line, while the remaining points cluster near zero. This violates the linearity assumption and confirms that a simple linear trend does not adequately describe the non-monotonic 2018–2023 trajectory. The pattern signals model misspecification rather than a randomly distributed error.

Normal Q-Q Plot: The 2021 data point deviates severely from the diagonal reference line at the upper tail, indicating that residuals are not normally distributed. With n = 6, the Q-Q plot has limited power to detect normality violations, but the departure of the 2021 point is extreme. This violation is directly attributable to the pandemic outlier and is unlikely to reflect a fundamental data problem.

Scale-Location Plot: The smoother line is not flat, with a notable peak corresponding to the 2021 observation. This suggests heteroscedasticity (non-constant residual variance), though at n = 6, distinguishing true heteroscedasticity from the influence of a single outlier is not possible. No transformation is planned; instead, the sensitivity analysis excluding 2021 (Model A2) addresses this issue by removing the offending point.

Residuals vs. Leverage: The 2021 observation has both high leverage and a large residual, resulting in high Cook’s distance (well above the rule-of-thumb threshold of 1). This confirms 2021 as a highly influential point that substantially alters regression results. Plan for Final Report: I will present Model A2 as the primary trend model and treat 2021 as a structural outlier attributable to COVID-19, to be explicitly modeled (e.g., with an indicator variable) rather than simply excluded.

# =============================================================================
# DIAGNOSTICS: Model A2 (rate ~ year, excl. 2021, n=5)
# =============================================================================
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_A2,
     sub.caption = "Model A2: rate ~ year (excl. 2021, n=5)",
     cex.main = 0.95)
Figure D2. Standard Diagnostic Plots: Model A2 (Overall Rate ~ Year, Excluding 2021, n = 5). Assumption violations are substantially reduced after removing the pandemic outlier.

Figure D2. Standard Diagnostic Plots: Model A2 (Overall Rate ~ Year, Excluding 2021, n = 5). Assumption violations are substantially reduced after removing the pandemic outlier.

par(mfrow = c(1, 1))

Diagnostic Interpretation — Model A2:

After excluding 2021, all four diagnostic plots show substantially improved behavior. Residuals vs. Fitted shows a more random scatter around zero with no systematic pattern. The Q-Q plot points follow the diagonal reference line closely, supporting approximate normality of residuals. The Scale-Location plot’s smoother is approximately flat, suggesting homoscedasticity. No observations have Cook’s distance exceeding 1. The remaining mild deviations are expected given n = 5 (extremely limited sample). These results support using Model A2 as the preferred trend model.

# =============================================================================
# DIAGNOSTICS: Model B2 (rate ~ race + year_c, n=8)
# =============================================================================
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_B2,
     sub.caption = "Model B2: rate ~ race + year_c (n=8)",
     cex.main = 0.95)
Figure D3. Standard Diagnostic Plots: Model B2 (Rate ~ Race + Year, n = 8). The primary inference model for racial disparity shows generally acceptable behavior given the small sample.

Figure D3. Standard Diagnostic Plots: Model B2 (Rate ~ Race + Year, n = 8). The primary inference model for racial disparity shows generally acceptable behavior given the small sample.

par(mfrow = c(1, 1))

Diagnostic Interpretation — Model B2:

Residuals vs. Fitted: Residuals appear approximately centered around zero with no strong systematic pattern, supporting the linearity assumption for this additive model. The Black non-Hispanic group (with the highest fitted values) does not show an outsized residual, which is reassuring.

Normal Q-Q Plot: Points follow the reference line reasonably well for the interior observations. Slight deviations at the tails are visible but not extreme, and are expected with n = 8. No severe normality violation is present.

Scale-Location: The smoother is approximately flat, suggesting constant variance across the fitted value range. One observation at the upper end (Black 2023) may show slightly elevated spread, but with only 8 observations, this cannot be distinguished from sampling noise.

Residuals vs. Leverage: No observation has Cook’s distance exceeding 1. The Black non-Hispanic observations carry higher leverage (as expected, since they define the reference intercept), but their influence is not excessive.

Conclusion: Model B2 meets diagnostic criteria adequately for aggregate data of this scale. Results should be interpreted cautiously given n = 8; the Final Report will contextualize uncertainty explicitly.


Section 5: Regression Visualizations

Figure 1: Primary Association — Adjusted Predictions by Race/Ethnicity

# =============================================================================
# FIGURE 1: Adjusted predicted values + 95% CIs from Model B2
# FIX APPLIED: race is DISCRETE — removed scale_x_continuous (invalid on
# discrete axis), fixed annotate() to use a character x label, replaced
# nudge_x with geom_text(y = predicted + offset) on continuous y-axis.
# =============================================================================

# Newdata: each race at 2023 (year_c = 1)
newdata_fig1 <- data.frame(
  race   = factor(
    c("Black non-Hispanic", "White non-Hispanic", "Hispanic", "Asian non-Hispanic"),
    levels = levels(race_long$race)
  ),
  year_c = 1L
)

# predict() returns a matrix; convert to data frame manually
pred_mat  <- predict(model_B2, newdata = newdata_fig1,
                     interval = "confidence", level = 0.95)
pred_fig1 <- data.frame(
  race      = newdata_fig1$race,
  race_chr  = as.character(newdata_fig1$race),
  predicted = pred_mat[, "fit"],
  ci_low    = pred_mat[, "lwr"],
  ci_high   = pred_mat[, "upr"]
) %>%
  mutate(
    # Reorder by predicted value so highest appears at top after coord_flip
    race_ordered = reorder(race_chr, predicted),
    ci_label     = paste0(round(predicted, 1), " [",
                          round(ci_low, 1), ", ", round(ci_high, 1), "]")
  )

race_colors_fig1 <- c(
  "Black non-Hispanic"  = "#C0392B",
  "White non-Hispanic"  = "#2980B9",
  "Hispanic"            = "#27AE60",
  "Asian non-Hispanic"  = "#8E44AD"
)

fig1 <- ggplot(pred_fig1,
               aes(x = race_ordered, y = predicted, color = race_chr)) +
  # National average reference — yintercept works on the continuous y axis
  geom_hline(yintercept = national_avg_2023, linetype = "dashed",
             color = "black", linewidth = 0.9, alpha = 0.8) +
  # 95% CI bars
  geom_errorbar(aes(ymin = ci_low, ymax = ci_high),
                width = 0.25, linewidth = 1.3, alpha = 0.8) +
  # Point estimates
  geom_point(size = 6, alpha = 0.95) +
  # Value labels: shift along y (continuous axis), NOT nudge_x
  geom_text(aes(label = ci_label, y = ci_high + 3),
            size = 3.0, hjust = 0, color = "gray20", lineheight = 0.85) +
  # Annotation: x MUST be a valid discrete label, not a numeric position
  annotate("text",
           x     = "White non-Hispanic",
           y     = national_avg_2023 + 1.5,
           label = paste0("Natl avg: ", national_avg_2023),
           size  = 3.2, hjust = 0, fontface = "italic", color = "gray40") +
  scale_color_manual(values = race_colors_fig1, guide = "none") +
  # y axis (horizontal after coord_flip) — only continuous scale needed
  scale_y_continuous(
    limits = c(0, 80),
    expand = expansion(mult = c(0.01, 0.02)),
    breaks = seq(0, 70, 10)
  ) +
  coord_flip() +
  theme_bw(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13),
    plot.subtitle    = element_text(size = 10.5, color = "gray30"),
    axis.text.y      = element_text(face = "bold", size = 11),
    panel.grid.minor = element_blank(),
    plot.caption     = element_text(size = 8, color = "gray40", hjust = 0)
  ) +
  labs(
    title    = "Figure 1. Adjusted Predicted Maternal Mortality Rates by Race/Ethnicity (2023)",
    subtitle = "Model B2: rate ~ race + year_c | Predicted at 2023 | Error bars = 95% Confidence Intervals",
    x        = NULL,
    y        = "Predicted Rate per 100,000 Live Births",
    caption  = paste0(
      "Source: Hoyert (2025), NCHS Health E-Stats. N = 8 group-year cells (4 groups x 2 years).
",
      "Labels: predicted rate [95% CI lower, upper]. Dashed line = 2023 national average (18.6)."
    )
  )
print(fig1)
Figure 1. Adjusted predicted maternal mortality rates (per 100,000 live births) by racial/ethnic group from Model B2 (rate ~ race + year_c, n = 8), with 95% confidence intervals. Predictions at year = 2023 (year_c = 1). Dashed = 2023 national average (18.6). Source: Hoyert (2025).

Figure 1. Adjusted predicted maternal mortality rates (per 100,000 live births) by racial/ethnic group from Model B2 (rate ~ race + year_c, n = 8), with 95% confidence intervals. Predictions at year = 2023 (year_c = 1). Dashed = 2023 national average (18.6). Source: Hoyert (2025).

Interpretation — Figure 1:

Figure 1 displays the adjusted predicted maternal mortality rates from Model B2 for each racial/ethnic group in 2023, holding the year covariate constant. Black non-Hispanic women have a predicted rate of approximately 50.3 deaths per 100,000 live births, more than three times the national average of 18.6 and dramatically higher than all other groups. The 95% confidence intervals for Black non-Hispanic women do not overlap with those of any other group, confirming that the disparity is statistically discernible even at this small sample size. All three non-Black groups have predicted rates below the national average, with Asian non-Hispanic women having the lowest predicted rate (10.7 per 100,000). This figure directly addresses the primary research question: substantial, statistically detectable racial disparities in maternal mortality persist in 2023 and are not explained by the secular year trend. The gap between Black women and all other groups is not merely a statistical artifact but represents tens of thousands of excess deaths when translated to the population level.


Figure 2: Full Model Coefficient Plot (Forest Plot)

# =============================================================================
# FIGURE 2: Forest plot of all coefficients from Model B2
# Method: broom::tidy(model, conf.int = TRUE) piped to ggplot2
# =============================================================================

tidy_B2_plot <- tidy(model_B2, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term_label = case_when(
      term == "raceWhite non-Hispanic"  ~ "White non-Hispanic\nvs. Black (reference)",
      term == "raceHispanic"            ~ "Hispanic\nvs. Black (reference)",
      term == "raceAsian non-Hispanic"  ~ "Asian non-Hispanic\nvs. Black (reference)",
      term == "year_c"                  ~ "Year\n(2023 vs. 2022)",
      TRUE                              ~ term
    ),
    # Categorize by predictor type
    predictor_type = ifelse(term == "year_c", "Year Covariate", "Race/Ethnicity"),
    # Significance flag
    sig = ifelse(p.value < 0.001, "p < 0.001",
           ifelse(p.value < 0.01, "p < 0.01",
           ifelse(p.value < 0.05, "p < 0.05", "p ≥ 0.05")))
  )

# Order terms: race first, then year
tidy_B2_plot$term_label <- factor(
  tidy_B2_plot$term_label,
  levels = rev(c("White non-Hispanic\nvs. Black (reference)",
                 "Hispanic\nvs. Black (reference)",
                 "Asian non-Hispanic\nvs. Black (reference)",
                 "Year\n(2023 vs. 2022)"))
)

fig2 <- ggplot(tidy_B2_plot,
               aes(x = estimate, y = term_label,
                   color = predictor_type, shape = sig)) +
  # Zero reference line
  geom_vline(xintercept = 0, linetype = "dashed",
             color = "gray40", linewidth = 0.9) +
  # CI segments
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
                 height = 0.22, linewidth = 1.3, alpha = 0.8) +
  # Point estimates
  geom_point(size = 5, alpha = 0.95) +
  # Coefficient labels
  geom_text(aes(label = paste0("β = ", round(estimate, 1),
                               "\n[", round(conf.low, 1), ", ", round(conf.high, 1), "]")),
            nudge_y = 0.28, size = 3.1, hjust = 0.5, color = "gray20",
            lineheight = 0.85) +
  scale_color_manual(
    values = c("Race/Ethnicity" = "#C0392B", "Year Covariate" = "#2980B9"),
    name   = "Predictor Type"
  ) +
  scale_shape_manual(
    values = c("p < 0.001" = 16, "p < 0.01" = 15, "p < 0.05" = 17, "p ≥ 0.05" = 1),
    name   = "Significance"
  ) +
  theme_bw(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 10.5, color = "gray30"),
    axis.text.y   = element_text(size = 11),
    legend.position = "right",
    panel.grid.minor = element_blank(),
    plot.caption  = element_text(size = 8, color = "gray40", hjust = 0)
  ) +
  labs(
    title    = "Figure 2. Coefficient Plot: Adjusted Regression of Maternal Mortality Rate (Model B2)",
    subtitle = "β = estimated rate difference (per 100,000 live births) vs. reference | Bars = 95% CI",
    x        = "Estimated Coefficient β (Deaths per 100,000 Live Births)",
    y        = NULL,
    caption  = paste0(
      "Model B2: rate ~ race + year_c. Reference: Black non-Hispanic women, 2022. N = 8.\n",
      "Filled symbols = statistically significant (p < 0.05). Open circle = not significant.\n",
      "Source: Hoyert (2025), NCHS Health E-Stats."
    )
  )
print(fig2)
Figure 2. Coefficient plot (forest plot) for Model B2: adjusted linear regression of maternal mortality rate on race/ethnicity and year (n = 8). Each point shows the estimated coefficient (β) and horizontal bars show 95% confidence intervals. All race coefficients are interpreted relative to Black non-Hispanic women (reference). The vertical dashed line at zero represents no difference from the reference group.

Figure 2. Coefficient plot (forest plot) for Model B2: adjusted linear regression of maternal mortality rate on race/ethnicity and year (n = 8). Each point shows the estimated coefficient (β) and horizontal bars show 95% confidence intervals. All race coefficients are interpreted relative to Black non-Hispanic women (reference). The vertical dashed line at zero represents no difference from the reference group.

Interpretation — Figure 2:

Figure 2 displays the estimated regression coefficients and 95% confidence intervals for all predictors in the adjusted Model B2. The three race/ethnicity coefficients (red, filled points) are all large, negative, and statistically significant, indicating that White non-Hispanic (β ≈ −33.2), Hispanic (β ≈ −35.5), and Asian non-Hispanic (β ≈ −38.5) women all have substantially lower mortality rates than Black non-Hispanic women, even after adjusting for year. The magnitude of these differences — ranging from 33 to 39 deaths per 100,000 live births — is clinically and epidemiologically enormous. The year covariate (blue point) shows a moderate negative coefficient (β ≈ −2.9) indicating the average decline from 2022 to 2023 across groups, and its 95% CI just excludes zero (p = 0.031). The fact that all confidence intervals for race are far from zero and do not approach it reinforces the robustness of the racial disparity finding. No term in Model B2 crosses the null, providing strong evidence that racial inequality in maternal mortality is not a statistical artifact of the year trend.


Bonus Figure 3: Secular Trend with Model Fits

# =============================================================================
# BONUS FIGURE 3: Overall trend with both Model A fits overlaid
# =============================================================================

# Predictions from A1 and A2 over 2018–2023
pred_grid <- data.frame(year = seq(2018, 2023, by = 0.25))
pred_grid$pred_A1 <- predict(model_A1, newdata = pred_grid)

# A2 predictions: only valid for 2018-2020, 2022-2023 range (excl. 2021)
pred_grid$pred_A2 <- predict(model_A2, newdata = pred_grid)

fig3 <- ggplot(overall, aes(x = year, y = rate)) +
  # Shaded COVID region
  annotate("rect", xmin = 2020.5, xmax = 2021.5,
           ymin = -Inf, ymax = Inf, fill = "#FADBD8", alpha = 0.45) +
  annotate("text", x = 2021, y = 34.5, label = "COVID-19\nPeak",
           size = 3.2, color = "#C0392B", fontface = "italic", hjust = 0.5) +
  # A1 trend (full period — gray dashed)
  geom_line(data = pred_grid, aes(x = year, y = pred_A1),
            color = "gray50", linetype = "dashed", linewidth = 1.0) +
  # A2 trend (excl. 2021 — orange solid)
  geom_line(data = pred_grid, aes(x = year, y = pred_A2),
            color = "#E67E22", linetype = "solid", linewidth = 1.2) +
  # Data points scaled by maternal deaths
  geom_point(aes(size = maternal_deaths), color = "#2C3E50", alpha = 0.85) +
  geom_text_repel(aes(label = paste0(rate, "\n(n=", maternal_deaths, ")")),
                  size = 3.1, color = "gray25", max.overlaps = 10) +
  # National average 2023 reference
  geom_hline(yintercept = national_avg_2023, linetype = "dotted",
             color = "#27AE60", linewidth = 0.85) +
  annotate("text", x = 2018.1, y = national_avg_2023 + 1.5,
           label = "2023 avg: 18.6", size = 3.2, color = "#27AE60",
           fontface = "italic", hjust = 0) +
  scale_size_continuous(name = "Maternal Deaths", range = c(3, 10),
                        breaks = c(658, 817, 1205), labels = comma) +
  scale_x_continuous(breaks = 2018:2023) +
  scale_y_continuous(limits = c(13, 38), breaks = seq(14, 38, 4)) +
  theme_bw(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", size = 13),
    plot.subtitle = element_text(size = 10.5, color = "gray30"),
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    plot.caption  = element_text(size = 8, color = "gray40", hjust = 0)
  ) +
  labs(
    title    = "Figure 3. U.S. Maternal Mortality Rate, 2018–2023: Secular Trend with Model Fits",
    subtitle = "Gray dashed = Model A1 (full period, R²=0.04, p=0.69) | Orange solid = Model A2 (excl. 2021, R²=0.90, p=0.01)",
    x        = "Year",
    y        = "Maternal Mortality Rate (per 100,000 Live Births)",
    caption  = paste0(
      "Point size ∝ number of maternal deaths. Shaded region = 2021 COVID-19 outlier (Cook's D > 1 in Model A1).\n",
      "Source: Hoyert (2025), NCHS Health E-Stats."
    )
  )
print(fig3)
Figure 3. U.S. overall maternal mortality rate, 2018–2023, with linear regression trend lines from Model A1 (full period, gray) and Model A2 (excluding 2021, orange). Points are scaled by number of maternal deaths. The 2021 pandemic peak (n=1,205 deaths) is identified as a highly influential outlier in Model A1 diagnostics.

Figure 3. U.S. overall maternal mortality rate, 2018–2023, with linear regression trend lines from Model A1 (full period, gray) and Model A2 (excluding 2021, orange). Points are scaled by number of maternal deaths. The 2021 pandemic peak (n=1,205 deaths) is identified as a highly influential outlier in Model A1 diagnostics.


References

Crearr-Perry, J., Maybank, A., Kesselheim, A. S., & Trinite, C. (2021). Social and structural determinants of health inequities in maternal health. Journal of Women’s Health, 30(2), 230–235. https://doi.org/10.1089/jwh.2020.8882

Hardeman, R. R., Burgess, D., Murphy, K., Satin, D., Nielsen, J., Potter, T. M., & Karbeah, J. (2020). Developing a medical school curriculum on racism: Multiyear design process and a critical race theory framework. Academic Medicine, 95(1), 101–105. https://doi.org/10.1097/ACM.0000000000002998

Hoyert, D. L. (2025). Maternal mortality rates in the United States, 2023. NCHS Health E-Stats. https://dx.doi.org/10.15620/cdc/174577

Petersen, E. E., Davis, N. L., Goodman, D., Cox, S., Syverson, C., Seed, K., Shapiro-Mendoza, C., Callaghan, W. M., & Barfield, W. (2019). Vital signs: Pregnancy-related deaths, United States, 2011–2015, and strategies for prevention, 13 states, 2013–2017. MMWR Morbidity and Mortality Weekly Report, 68(18), 423–429. https://doi.org/10.15585/mmwr.mm6818e1


Appendix: AI Interaction Log

Per EPI 553 AI policy: AI tools google search engine was used and chat for debugging, expecially when I started having some erros I had re check and see what was wrong with the code and prompts. All analytical decisions, interpretations, and model specifications were made independently by me. The following documents AI interactions.

```