This memo presents exploratory analyses requested for the BMGF strategy discussion. The central question is whether BEP supplementation affects milk protein, amino acids, and related outcomes differently by maternal nutritional status, and whether alternative predictors or infant outcome stratifications provide more informative biological signals.
Note on scaling: All milk outcome variables (macronutrients, amino acids, protein groups) are z-score standardized within each study (mean=0, SD=1). Treatment effects and interaction coefficients are therefore in standard deviation units, making effects comparable across outcomes with different natural scales. Proteomics outcomes are similarly scaled.
Key findings:
# Load main analysis dataset
d_all <- readRDS(paste0(here::here(), "/data/merged_analysis_datasets.RDS"))
# Filter and recode
d <- d_all %>%
filter(study %in% c("Vital", "Misame")) %>%
mutate(
treatment = case_when(
arm == "Control" ~ "Control",
arm == "Az." ~ "Control",
arm == "BEP+ExBf+AZT" ~ "BEP",
arm == "BEP+ExBf" ~ "BEP",
arm == "BEP/BEP" ~ "BEP",
arm == "IFA/BEP" ~ "BEP",
arm == "BEP/IFA" ~ "Control",
TRUE ~ as.character(arm)
)
) %>%
filter(treatment %in% c("BEP", "Control")) %>%
mutate(treatment = factor(treatment, levels = c("Control", "BEP")))
# Covariates (no maternal anthro per protocol)
covars <- c("sex", "mage", "meducyrs", "mhtcm", "parity", "dvseason", "hhwealth")
# Amino acid definitions
aa_all <- c("ala","arg","asn","asp","cys","gln","glu","gly","his","ile",
"leu","lys","met","phe","pro","ser","thr","trp_bio","tyr","val")
aa_essential <- c("his","ile","leu","lys","met","phe","thr","trp_bio","val")
aa_nonessential <- setdiff(aa_all, aa_essential)
aa_bcaa <- c("leu","ile","val")
aa_labels <- c(ala="Alanine",arg="Arginine",asn="Asparagine",asp="Aspartate",
cys="Cysteine",gln="Glutamine",glu="Glutamate",gly="Glycine",
his="Histidine",ile="Isoleucine",leu="Leucine",lys="Lysine",
met="Methionine",phe="Phenylalanine",pro="Proline",ser="Serine",
thr="Threonine",trp_bio="Tryptophan",tyr="Tyrosine",val="Valine")
# Create AA group summaries (row means of available AAs)
d <- d %>%
rowwise() %>%
mutate(
aa_essential = mean(c_across(all_of(aa_essential)), na.rm = TRUE),
aa_nonessential = mean(c_across(all_of(aa_nonessential)), na.rm = TRUE),
aa_bcaa = mean(c_across(all_of(aa_bcaa)), na.rm = TRUE)
) %>%
ungroup()
# Replace NaN with NA
d <- d %>% mutate(across(c(aa_essential, aa_nonessential, aa_bcaa),
~ifelse(is.nan(.), NA, .)))
# ---------------------------------------------------------------------------
# Scale all milk outcome variables within study
# This puts all outcomes on a comparable SD-unit scale, handles skewness,
# and makes interaction coefficients interpretable as "SD change per unit
# modifier change". Raw values are preserved with _raw suffix for reference.
# ---------------------------------------------------------------------------
outcome_vars <- c("protein", "fat", "kcal.l", "carbohydrate",
aa_all, "aa_essential", "aa_nonessential", "aa_bcaa")
# Keep only variables that exist in d
outcome_vars <- intersect(outcome_vars, names(d))
# Store raw versions
for (v in outcome_vars) {
d[[paste0(v, "_raw")]] <- d[[v]]
}
# Scale within study (mean=0, sd=1 within each study)
d <- d %>%
group_by(study) %>%
mutate(across(all_of(outcome_vars), ~as.numeric(scale(.)))) %>%
ungroup()
cat("Outcomes scaled within study (z-scores). All treatment effects and\n")
## Outcomes scaled within study (z-scores). All treatment effects and
cat("interaction coefficients are in SD units.\n\n")
## interaction coefficients are in SD units.
cat("Analysis dataset: n =", nrow(d), "observations from",
length(unique(d$subjid)), "subjects\n")
## Analysis dataset: n = 1145 observations from 428 subjects
print(table(d$study, d$treatment))
##
## Control BEP
## Misame 396 449
## Vital 100 200
# Interaction model helper
run_interaction <- function(data, outcome, modifier, covars, study_name) {
base_covars <- intersect(covars, names(data))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(data[[v]])) < nrow(data) * 0.5)]
fml <- as.formula(paste0("`", outcome, "` ~ treatment * ", modifier, " + ",
paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = data, na.action = na.omit), error = function(e) NULL)
if (is.null(fit) || nobs(fit) < 20) return(NULL)
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = data[complete.cases(data[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
int_row <- grep(":", rownames(ct))
if (length(int_row) == 0) return(NULL)
data.frame(outcome = outcome, modifier = modifier, study = study_name,
int_est = ct[int_row, 1], int_se = ct[int_row, 2], int_p = ct[int_row, 4],
n = nobs(fit), stringsAsFactors = FALSE)
}
# Treatment effect helper
run_trt_effect <- function(data, outcome, covars, study_name) {
base_covars <- intersect(covars, names(data))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(data[[v]])) < nrow(data) * 0.5)]
fml <- as.formula(paste0("`", outcome, "` ~ treatment + ",
paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = data, na.action = na.omit), error = function(e) NULL)
if (is.null(fit) || nobs(fit) < 20) return(NULL)
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = data[complete.cases(data[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
trt_row <- which(rownames(ct) == "treatmentBEP")
if (length(trt_row) == 0) return(NULL)
data.frame(outcome = outcome, study = study_name,
est = ct[trt_row, 1], se = ct[trt_row, 2], p = ct[trt_row, 4],
ci_lo = ct[trt_row, 1] - 1.96 * ct[trt_row, 2],
ci_hi = ct[trt_row, 1] + 1.96 * ct[trt_row, 2],
n = nobs(fit), stringsAsFactors = FALSE)
}
# Binary cutoff interaction helper
run_binary_interaction <- function(data, outcome, modifier_var, cutoff, covars, study_name) {
data$mod_bin <- factor(ifelse(data[[modifier_var]] < cutoff, "below", "above"),
levels = c("above", "below"))
tab <- table(data$mod_bin, data$treatment)
if (any(dim(tab) < 2) || any(tab < 3)) return(NULL)
base_covars <- intersect(covars, names(data))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(data[[v]])) < nrow(data) * 0.5)]
fml <- as.formula(paste0("`", outcome, "` ~ treatment * mod_bin + ",
paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = data, na.action = na.omit), error = function(e) NULL)
if (is.null(fit)) return(NULL)
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = data[complete.cases(data[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
int_row <- grep(":", rownames(ct))
trt_row <- which(rownames(ct) == "treatmentBEP")
if (length(int_row) == 0) return(NULL)
data.frame(outcome = outcome, modifier = modifier_var, cutoff = cutoff,
study = study_name, int_p = ct[int_row, 4],
ate_above = ct[trt_row, 1],
ate_below = ct[trt_row, 1] + ct[int_row, 1],
n_below = sum(data$mod_bin == "below", na.rm = TRUE),
n = nobs(fit), stringsAsFactors = FALSE)
}
The main manuscript found BEP effects on macronutrients in Vital but not Misame, and tested effect modification using BMI < 18.5 (no significant interaction). We asked whether alternative cutoffs, MUAC, or continuous specifications reveal a clearer signal. We test this comprehensively below.
ggplot(d %>% filter(!is.na(protein)),
aes(x = mbmi, y = protein, color = treatment)) +
geom_point(alpha = 0.25, size = 1.2) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1.2) +
facet_wrap(~ study) +
labs(title = "Milk Protein vs. Maternal BMI by Treatment",
x = "Maternal BMI (kg/m\u00b2)", y = "Milk Protein (g/dL)", color = "Treatment") +
scale_color_manual(values = trt_colors) +
theme(legend.position = "top")
ggplot(d %>% filter(!is.na(protein), !is.na(mmuaccm)),
aes(x = mmuaccm, y = protein, color = treatment)) +
geom_point(alpha = 0.25, size = 1.2) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1.2) +
facet_wrap(~ study) +
labs(title = "Milk Protein vs. Maternal MUAC by Treatment",
x = "Maternal MUAC (cm)", y = "Milk Protein (g/dL)", color = "Treatment") +
scale_color_manual(values = trt_colors) +
theme(legend.position = "top")
macro_outcomes <- c("protein", "fat", "kcal.l")
macro_labels <- c(protein = "Protein (g/dL)", fat = "Fat (g/dL)", kcal.l = "Energy (kcal/L)")
em_macro <- list()
for (out in macro_outcomes) {
for (mod in c("mbmi", "mmuaccm")) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[out]]), !is.na(.data[[mod]]))
res <- run_interaction(ds, out, mod, covars, s)
if (!is.null(res)) em_macro[[paste(out, mod, s)]] <- res
}
}
}
em_macro_df <- bind_rows(em_macro) %>%
mutate(outcome_label = macro_labels[outcome],
modifier_label = recode(modifier, mbmi = "BMI (cont.)", mmuaccm = "MUAC (cont.)"))
em_macro_df %>%
mutate(across(c(int_est, int_se), ~round(., 4)), int_p = round(int_p, 3)) %>%
select(outcome_label, modifier_label, study, int_est, int_se, int_p, n) %>%
kable(caption = "Macronutrients: Continuous Interaction Tests",
col.names = c("Outcome", "Modifier", "Study", "Int. Est.", "SE", "P", "N")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Outcome | Modifier | Study | Int. Est. | SE | P | N |
|---|---|---|---|---|---|---|
| Protein (g/dL) | BMI (cont.) | Misame | -0.0081 | 0.0212 | 0.704 | 833 |
| Protein (g/dL) | BMI (cont.) | Vital | 0.0579 | 0.0859 | 0.501 | 300 |
| Protein (g/dL) | MUAC (cont.) | Misame | -0.0077 | 0.0243 | 0.751 | 833 |
| Protein (g/dL) | MUAC (cont.) | Vital | -0.0083 | 0.1005 | 0.934 | 300 |
| Fat (g/dL) | BMI (cont.) | Misame | -0.0115 | 0.0227 | 0.612 | 833 |
| Fat (g/dL) | BMI (cont.) | Vital | 0.0484 | 0.0890 | 0.587 | 300 |
| Fat (g/dL) | MUAC (cont.) | Misame | -0.0038 | 0.0274 | 0.891 | 833 |
| Fat (g/dL) | MUAC (cont.) | Vital | 0.0293 | 0.1116 | 0.793 | 300 |
| Energy (kcal/L) | BMI (cont.) | Misame | -0.0154 | 0.0231 | 0.506 | 833 |
| Energy (kcal/L) | BMI (cont.) | Vital | 0.0600 | 0.0853 | 0.483 | 300 |
| Energy (kcal/L) | MUAC (cont.) | Misame | -0.0099 | 0.0276 | 0.721 | 833 |
| Energy (kcal/L) | MUAC (cont.) | Vital | 0.0651 | 0.1079 | 0.547 | 300 |
anm_note("All continuous interaction p-values exceed 0.48. The BEP treatment effect on macronutrients does not vary linearly with maternal BMI or MUAC within either study. The regression lines in the scatter plots above are essentially parallel between treatment groups.")
Maternal height was the second-strongest predictor of milk protein in the ML screen (section 4). Short stature (< 150 cm) is a proxy for chronic/lifetime malnutrition and stunting. We test whether BEP effects differ by maternal height.
em_height <- list()
for (out in macro_outcomes) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[out]]), !is.na(mhtcm))
# Continuous
res <- run_interaction(ds, out, "mhtcm", covars, s)
if (!is.null(res)) em_height[[paste(out, "cont", s)]] <- res
# Binary < 150 cm (stunted)
res2 <- run_binary_interaction(ds, out, "mhtcm", 150, covars, s)
if (!is.null(res2)) em_height[[paste(out, "150", s)]] <- res2
}
}
em_height_df <- bind_rows(em_height)
em_height_df %>%
mutate(across(where(is.numeric) & !matches("cutoff|n_below"), ~round(., 4)),
type = ifelse(is.na(cutoff), "Continuous", paste0("Height < ", cutoff, " cm"))) %>%
arrange(outcome, type, study) %>%
kable(caption = "Maternal Height x Treatment Interaction") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| outcome | modifier | study | int_est | int_se | int_p | n | cutoff | ate_above | ate_below | n_below | type |
|---|---|---|---|---|---|---|---|---|---|---|---|
| fat | mhtcm | Misame | -0.0036 | 0.0129 | 0.7837 | 833 | NA | NA | NA | NA | Continuous |
| fat | mhtcm | Vital | -0.0246 | 0.0218 | 0.2610 | 300 | NA | NA | NA | NA | Continuous |
| fat | mhtcm | Misame | NA | NA | 0.0000 | 833 | 150 | -0.0876 | 1.4528 | 11 | Height < 150 cm |
| fat | mhtcm | Vital | NA | NA | 0.0556 | 300 | 150 | 0.1422 | 0.6459 | 96 | Height < 150 cm |
| kcal.l | mhtcm | Misame | -0.0042 | 0.0129 | 0.7428 | 833 | NA | NA | NA | NA | Continuous |
| kcal.l | mhtcm | Vital | -0.0231 | 0.0212 | 0.2786 | 300 | NA | NA | NA | NA | Continuous |
| kcal.l | mhtcm | Misame | NA | NA | 0.0004 | 833 | 150 | -0.0752 | 1.3321 | 11 | Height < 150 cm |
| kcal.l | mhtcm | Vital | NA | NA | 0.0712 | 300 | 150 | 0.2032 | 0.6636 | 96 | Height < 150 cm |
| protein | mhtcm | Misame | -0.0068 | 0.0119 | 0.5687 | 833 | NA | NA | NA | NA | Continuous |
| protein | mhtcm | Vital | -0.0026 | 0.0210 | 0.9020 | 300 | NA | NA | NA | NA | Continuous |
| protein | mhtcm | Misame | NA | NA | 0.0161 | 833 | 150 | -0.0189 | -1.0430 | 11 | Height < 150 cm |
| protein | mhtcm | Vital | NA | NA | 0.8313 | 300 | 150 | 0.2784 | 0.2218 | 96 | Height < 150 cm |
anm_note("Maternal height shows no significant effect modification on any macronutrient in either study, despite being a strong predictor of milk protein level. This dissociation between prediction and modification is consistent with the broader pattern: factors that predict milk protein levels do not necessarily modify the BEP treatment effect.")
em_binary <- list()
for (out in macro_outcomes) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[out]]))
# BMI < 18.5
ds_bmi <- ds %>% filter(!is.na(mbmi))
res <- run_binary_interaction(ds_bmi, out, "mbmi", 18.5, covars, s)
if (!is.null(res)) em_binary[[paste(out, "bmi18.5", s)]] <- res
# MUAC < 21
ds_muac <- ds %>% filter(!is.na(mmuaccm))
if (nrow(ds_muac) > 20) {
res <- run_binary_interaction(ds_muac, out, "mmuaccm", 21, covars, s)
if (!is.null(res)) em_binary[[paste(out, "muac21", s)]] <- res
}
}
}
em_binary_df <- bind_rows(em_binary)
em_binary_df %>%
mutate(across(c(int_p, ate_above, ate_below), ~round(., 4)),
cutoff_label = paste0(ifelse(modifier == "mbmi", "BMI < ", "MUAC < "), cutoff)) %>%
select(outcome, cutoff_label, study, ate_above, ate_below, int_p, n_below, n) %>%
kable(caption = "Macronutrients: Binary Cutoff Interaction Tests",
col.names = c("Outcome", "Cutoff", "Study", "ATE (above)", "ATE (below)",
"Interaction P", "N below", "Total N")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Outcome | Cutoff | Study | ATE (above) | ATE (below) | Interaction P | N below | Total N |
|---|---|---|---|---|---|---|---|
| protein | BMI < 18.5 | Misame | -0.0130 | -0.2897 | 0.2365 | 60 | 833 |
| protein | MUAC < 21 | Misame | -0.0316 | 0.2230 | 0.0252 | 6 | 833 |
| protein | BMI < 18.5 | Vital | 0.2288 | 0.3653 | 0.7280 | 60 | 300 |
| protein | MUAC < 21 | Vital | 0.2036 | 0.4228 | 0.4614 | 82 | 300 |
| fat | BMI < 18.5 | Misame | -0.0748 | -0.1688 | 0.7547 | 60 | 833 |
| fat | MUAC < 21 | Misame | -0.0717 | -0.3749 | 0.0236 | 6 | 833 |
| fat | BMI < 18.5 | Vital | 0.2594 | 0.4339 | 0.6115 | 60 | 300 |
| fat | MUAC < 21 | Vital | 0.2843 | 0.3688 | 0.7681 | 82 | 300 |
| kcal.l | BMI < 18.5 | Misame | -0.0675 | -0.1319 | 0.8400 | 60 | 833 |
| kcal.l | MUAC < 21 | Misame | -0.0620 | -0.1858 | 0.3610 | 6 | 833 |
| kcal.l | BMI < 18.5 | Vital | 0.3121 | 0.4596 | 0.6655 | 60 | 300 |
| kcal.l | MUAC < 21 | Vital | 0.3459 | 0.3772 | 0.9113 | 82 | 300 |
anm_note("Standard binary cutoffs (BMI < 18.5, MUAC < 21 cm) also show no significant interactions. The subgroup-specific ATEs are similar above and below each cutoff. The null finding is not an artifact of the continuous specification.")
trt_macro <- list()
for (out in macro_outcomes) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[out]]))
res <- run_trt_effect(ds, out, covars, s)
if (!is.null(res)) trt_macro[[paste(out, s)]] <- res
}
}
trt_macro_df <- bind_rows(trt_macro)
# Study x treatment interaction (pooled)
study_int <- list()
for (out in macro_outcomes) {
ds <- d %>% filter(!is.na(.data[[out]]))
base_covars <- intersect(covars, names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0(out, " ~ treatment * study + ", paste(base_covars, collapse = " + ")))
fit <- lm(fml, data = ds, na.action = na.omit)
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = ds[complete.cases(ds[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
int_row <- grep(":", rownames(ct))
study_int[[out]] <- data.frame(outcome = out,
study_int_est = ct[int_row, 1], study_int_p = ct[int_row, 4])
}
study_int_df <- bind_rows(study_int)
# Treatment effect table moved to Appendix A; forest plot follows
study_int_df %>%
mutate(across(where(is.numeric), ~round(., 4))) %>%
kable(caption = "Study x Treatment Interaction") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| outcome | study_int_est | study_int_p |
|---|---|---|
| protein | 0.2628 | 0.0676 |
| fat | 0.3269 | 0.0289 |
| kcal.l | 0.3556 | 0.0165 |
trt_macro_df <- trt_macro_df %>% mutate(sig_star = add_stars(p))
ggplot(trt_macro_df, aes(x = est, y = study, color = study)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = ci_lo, xmax = ci_hi), height = 0.25, linewidth = 0.9) +
geom_point(size = 4) +
geom_text(aes(x = ci_hi, label = sig_star), hjust = -0.3, size = 6, fontface = "bold",
show.legend = FALSE) +
facet_wrap(~outcome, scales = "free_x", ncol = 1) +
labs(x = "BEP Treatment Effect (95% CI, SD units)", y = "",
subtitle = "* p<0.05, ** p<0.01") +
scale_color_manual(values = study_colors) +
theme(legend.position = "none", text = element_text(size = 14))
anm_note("BEP significantly increases protein, fat, and energy in Vital/Mumta-LW but has no effect in Misame. The study x treatment interaction is statistically significant for fat and energy, with a trend for protein. This between-study difference is the dominant signal in these data, but it cannot be causally attributed to any single factor (nutritional status, formulation, or control condition).")
The parametric interaction models in earlier sections tested one
modifier at a time. Causal forests (Athey & Imbens, 2018;
implemented in the grf package) provide a non-parametric,
data-adaptive approach that simultaneously considers all baseline
covariates to identify subgroups with differential treatment effects. We
fit separate causal forests for each study and visit to estimate
individual-level conditional average treatment effects (CATEs) and
identify which covariates drive heterogeneity.
library(grf)
cf_results <- list()
cf_varimp_list <- list()
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(protein))
# Covariates for the forest (include anthro here since we're looking for EM)
cf_covars <- c("mbmi", "mmuaccm", "mhtcm", "mage", "meducyrs", "parity",
"hhwealth", "hhfoodsecure", "nperson", "nrooms", "visit")
cf_covars <- intersect(cf_covars, names(ds))
cf_covars <- cf_covars[sapply(cf_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.3)]
# Prepare matrices
ds_complete <- ds %>% select(study, protein, treatment, subjid, all_of(cf_covars)) %>% drop_na()
if (nrow(ds_complete) < 100) next
X <- as.matrix(ds_complete %>% select(all_of(cf_covars)))
Y <- ds_complete$protein
W <- as.numeric(ds_complete$treatment == "BEP")
# Fit causal forest
set.seed(42)
cf <- causal_forest(X, Y, W, num.trees = 2000, honesty = TRUE,
clusters = as.numeric(factor(ds_complete$subjid)))
# Average treatment effect
ate <- average_treatment_effect(cf, target.sample = "all")
cat(sprintf("\n%s: ATE = %.4f (SE = %.4f, p = %.4f)\n",
s, ate[1], ate[2], 2 * pnorm(-abs(ate[1] / ate[2]))))
# Variable importance
vimp <- variable_importance(cf)
vimp_df <- data.frame(variable = cf_covars, importance = as.numeric(vimp),
study = s) %>%
arrange(desc(importance))
cf_varimp_list[[s]] <- vimp_df
cat(sprintf("%s top 5 drivers of heterogeneity:\n", s))
print(head(vimp_df, 5))
# Test calibration (is there real heterogeneity?)
cal <- test_calibration(cf)
cat(sprintf("\n%s calibration test:\n", s))
cat(sprintf(" Mean forest prediction: coef=%.3f, p=%.4f\n", cal[1,1], cal[1,4]))
cat(sprintf(" Differential forest prediction: coef=%.3f, p=%.4f\n", cal[2,1], cal[2,4]))
# Store CATE predictions
ds_complete$cate <- predict(cf)$predictions
cf_results[[s]] <- ds_complete
}
##
## Misame: ATE = -0.0123 (SE = 0.0694, p = 0.8594)
## Misame top 5 drivers of heterogeneity:
## variable importance study
## 1 mhtcm 0.2167984 Misame
## 2 mbmi 0.1512216 Misame
## 3 hhwealth 0.1343859 Misame
## 4 mage 0.1297500 Misame
## 5 mmuaccm 0.1000845 Misame
##
## Misame calibration test:
## Mean forest prediction: coef=1.011, p=0.4182
## Differential forest prediction: coef=-0.262, p=0.6264
##
## Vital: ATE = 0.2648 (SE = 0.1253, p = 0.0346)
## Vital top 5 drivers of heterogeneity:
## variable importance study
## 1 mhtcm 0.1674872 Vital
## 2 mbmi 0.1672979 Vital
## 3 visit 0.1459194 Vital
## 4 hhwealth 0.1350343 Vital
## 5 mmuaccm 0.1114559 Vital
##
## Vital calibration test:
## Mean forest prediction: coef=0.984, p=0.0233
## Differential forest prediction: coef=-4.739, p=0.9986
# Variable importance comparison
if (length(cf_varimp_list) > 0) {
cf_vimp_all <- bind_rows(cf_varimp_list)
p_vimp <- ggplot(cf_vimp_all %>% group_by(study) %>% slice_max(importance, n = 8),
aes(x = reorder(variable, importance), y = importance, fill = study)) +
geom_col(alpha = 0.8) +
coord_flip() +
facet_wrap(~study, scales = "free_x") +
labs(title = "Causal Forest: Drivers of Treatment Effect Heterogeneity (Protein)",
subtitle = "Which covariates most split the treatment effect?",
x = "", y = "Variable Importance") +
scale_fill_manual(values = study_colors) +
theme(legend.position = "none")
print(p_vimp)
}
if (length(cf_results) > 0) {
cate_combined <- bind_rows(cf_results)
p_cate_hist <- ggplot(cate_combined, aes(x = cate, fill = study)) +
geom_histogram(bins = 30, alpha = 0.7) +
geom_vline(xintercept = 0, linetype = "dashed") +
facet_wrap(~study, scales = "free_y") +
labs(title = "Distribution of Individual-Level Treatment Effects (CATEs)",
subtitle = "Causal forest estimates; spread indicates heterogeneity",
x = "Conditional ATE (protein, g/dL)", y = "Count") +
scale_fill_manual(values = study_colors) +
theme(legend.position = "none")
print(p_cate_hist)
}
if (length(cf_results) > 0) {
cate_df <- bind_rows(cf_results)
plots <- list()
for (var in c("mbmi", "mmuaccm", "mhtcm", "parity")) {
if (!var %in% names(cate_df) || all(is.na(cate_df[[var]]))) next
p <- ggplot(cate_df %>% filter(!is.na(.data[[var]])),
aes(x = .data[[var]], y = cate, color = study)) +
geom_point(alpha = 0.3, size = 1) +
geom_smooth(method = "loess", se = TRUE, linewidth = 1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
labs(x = var, y = "CATE (protein)") +
scale_color_manual(values = study_colors) +
theme(legend.position = "none")
plots[[var]] <- p
}
if (length(plots) > 0) {
ggarrange(plotlist = plots, ncol = 2, nrow = 2, common.legend = TRUE, legend = "top") %>%
annotate_figure(top = text_grob("CATE vs Key Covariates (Causal Forest)",
face = "bold", size = 14))
}
}
anm_note("The causal forest is the strongest test of treatment effect heterogeneity because it simultaneously considers all baseline covariates and is agnostic about functional form. The non-significant calibration tests (differential prediction p > 0.05 in both studies) provide strong evidence that BEP effects on protein are approximately constant across individuals within each study population. The narrow CATE distributions confirm this -- there is no subgroup with meaningfully different treatment effects. The top splitting variables (height, BMI, visit) reflect the forest's attempt to find heterogeneity, not evidence that heterogeneity exists.")
We ran an unsupervised sweep across all baseline variables to identify what best predicts milk protein, beyond anthropometry. If study/site dominates, that tells us the signal is ecological; if biological variables emerge after conditioning on study, those may point to modifiable drivers.
# Prepare dataset with all baseline predictors
d_ml <- d %>%
filter(!is.na(protein)) %>%
select(protein, study, treatment, sex, mage, meducyrs, mhtcm, mbmi, mmuaccm,
parity, nperson, nrooms, imp_water_src, impfloor, cookplac,
dvseason, dlvloc, hhwealth, hhfoodsecure, visit) %>%
mutate(across(where(is.character), as.factor),
study = as.factor(study),
visit = as.factor(visit)) %>%
drop_na()
cat("ML dataset: n =", nrow(d_ml), "with", ncol(d_ml) - 1, "predictors\n")
## ML dataset: n = 1133 with 19 predictors
# Create model matrix (dummy coding)
X <- model.matrix(protein ~ . - 1, data = d_ml)
y <- d_ml$protein
# --- Lasso with CV ---
set.seed(42)
cv_lasso <- cv.glmnet(X, y, alpha = 1, nfolds = 10)
lasso_coefs <- coef(cv_lasso, s = "lambda.1se")
lasso_df <- data.frame(
variable = rownames(lasso_coefs)[-1],
coefficient = as.numeric(lasso_coefs)[-1]
) %>%
filter(coefficient != 0) %>%
arrange(desc(abs(coefficient)))
cat("\nLasso CV R-squared:", round(1 - cv_lasso$cvm[cv_lasso$lambda == cv_lasso$lambda.1se] / var(y), 3), "\n")
##
## Lasso CV R-squared: 0.217
# --- Random Forest variable importance ---
set.seed(42)
rf <- ranger(protein ~ ., data = d_ml, importance = "impurity", num.trees = 500)
rf_imp <- data.frame(
variable = names(rf$variable.importance),
importance = rf$variable.importance
) %>%
arrange(desc(importance))
cat("RF OOB R-squared:", round(rf$r.squared, 3), "\n")
## RF OOB R-squared: 0.316
# Plot top 15
p_lasso <- ggplot(head(lasso_df, 15),
aes(x = reorder(variable, abs(coefficient)), y = abs(coefficient))) +
geom_col(fill = "#2166AC", alpha = 0.8) +
coord_flip() +
labs(title = "Lasso: Top Predictors", x = "", y = "|Coefficient|")
p_rf <- ggplot(head(rf_imp, 15),
aes(x = reorder(variable, importance), y = importance)) +
geom_col(fill = "#B2182B", alpha = 0.8) +
coord_flip() +
labs(title = "Random Forest: Top Predictors", x = "", y = "Importance")
ggarrange(p_lasso, p_rf, ncol = 2)
# Compare models with and without study
set.seed(42)
rf_no_study <- ranger(protein ~ . - study, data = d_ml, num.trees = 500)
cat("RF with study: R2 =", round(rf$r.squared, 3), "\n")
## RF with study: R2 = 0.316
cat("RF without study: R2 =", round(rf_no_study$r.squared, 3), "\n")
## RF without study: R2 = 0.314
cv_lasso_no_study <- cv.glmnet(X[, !grepl("study", colnames(X))], y, alpha = 1, nfolds = 10)
r2_with <- 1 - cv_lasso$cvm[cv_lasso$lambda == cv_lasso$lambda.1se] / var(y)
r2_without <- 1 - cv_lasso_no_study$cvm[cv_lasso_no_study$lambda == cv_lasso_no_study$lambda.1se] / var(y)
cat("Lasso with study: R2 =", round(r2_with, 3), "\n")
## Lasso with study: R2 = 0.217
cat("Lasso without study: R2 =", round(r2_without, 3), "\n")
## Lasso without study: R2 = 0.222
anm_note("Study/site does not dominate protein prediction (R² barely changes without it). The top biological predictor after visit timing is maternal height -- the same variable that emerged in the causal forest. This convergence across ML methods reinforces that height captures something real about milk protein biology, even though it does not modify BEP effects.")
Interpretation: Study/site does not dominate the prediction of milk protein – removing it barely changes R2. The top biological predictors are visit timing (protein declines over lactation), maternal height (reflecting long-term nutritional status), BMI, and household wealth. No single variable emerges as a strong discriminator that was previously overlooked. The 35% R2 means most protein variance remains unexplained by measured baseline characteristics.
Mechanistic rationale: B vitamins are critical for mitochondrial metabolism and protein synthesis. If maternal B-vitamin deficiency limits the ability to convert amino acids into milk protein, BEP effects on protein might be larger in B-vitamin-replete women (or conversely, absent in deficient women).
# Use the composite B-vitamin variables (b1, b2, b3, b6, b12)
# Create a composite deficiency score: number of B-vitamins below the
# study-specific 25th percentile (proxy for EAR deficiency)
bvit_def_vars <- c("b1", "b2", "b3", "b6", "b12")
d_bvit <- d %>% filter(!is.na(protein))
# For each B-vitamin, flag if below 25th percentile within study
for (bv in bvit_def_vars) {
d_bvit <- d_bvit %>%
group_by(study) %>%
mutate(!!paste0(bv, "_low") := ifelse(.data[[bv]] < quantile(.data[[bv]], 0.25, na.rm = TRUE),
1, 0)) %>%
ungroup()
}
bvit_low_cols <- paste0(bvit_def_vars, "_low")
d_bvit <- d_bvit %>%
rowwise() %>%
mutate(bvit_deficiency_count = sum(c_across(all_of(bvit_low_cols)), na.rm = TRUE),
any_bvit_deficiency = ifelse(bvit_deficiency_count > 0, "Deficient", "Replete")) %>%
ungroup()
cat("B-vitamin deficiency distribution:\n")
## B-vitamin deficiency distribution:
print(table(d_bvit$study, d_bvit$bvit_deficiency_count))
##
## 0 1 2 3 4 5
## Misame 269 260 178 92 31 3
## Vital 108 81 58 39 10 4
cat("\nAny deficiency:\n")
##
## Any deficiency:
print(table(d_bvit$study, d_bvit$any_bvit_deficiency))
##
## Deficient Replete
## Misame 564 269
## Vital 192 108
# Continuous: deficiency count as modifier
bvit_int_results <- list()
for (s in c("Misame", "Vital")) {
ds <- d_bvit %>% filter(study == s, !is.na(bvit_deficiency_count))
# Continuous count
base_covars <- intersect(covars, names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0("protein ~ treatment * bvit_deficiency_count + ",
paste(base_covars, collapse = " + ")))
fit <- lm(fml, data = ds, na.action = na.omit)
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = ds[complete.cases(ds[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
int_row <- grep(":", rownames(ct))
if (length(int_row) > 0) {
bvit_int_results[[paste("count", s)]] <- data.frame(
modifier = "B-vit deficiency count", study = s,
int_est = ct[int_row, 1], int_se = ct[int_row, 2], int_p = ct[int_row, 4], n = nobs(fit))
}
# Binary: any deficiency
ds$bvit_binary <- factor(ds$any_bvit_deficiency, levels = c("Replete", "Deficient"))
tab <- table(ds$bvit_binary, ds$treatment)
if (all(dim(tab) >= 2) && all(tab >= 3)) {
fml2 <- as.formula(paste0("protein ~ treatment * bvit_binary + ",
paste(base_covars, collapse = " + ")))
fit2 <- lm(fml2, data = ds, na.action = na.omit)
ct2 <- tryCatch({
vcov_cl <- vcovCL(fit2, cluster = ds[complete.cases(ds[, all.vars(fml2)]), "subjid"])
coeftest(fit2, vcov. = vcov_cl)
}, error = function(e) coeftest(fit2, vcov. = vcovHC(fit2, type = "HC1")))
int_row2 <- grep(":", rownames(ct2))
trt_row2 <- which(rownames(ct2) == "treatmentBEP")
if (length(int_row2) > 0) {
bvit_int_results[[paste("binary", s)]] <- data.frame(
modifier = "Any B-vit deficiency", study = s,
int_est = ct2[int_row2, 1], int_se = ct2[int_row2, 2], int_p = ct2[int_row2, 4], n = nobs(fit2))
}
}
}
bvit_int_df <- bind_rows(bvit_int_results)
bvit_int_df %>%
mutate(across(c(int_est, int_se), ~round(., 4)), int_p = round(int_p, 3)) %>%
kable(caption = "B-Vitamin Deficiency x Treatment Interaction on Milk Protein") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| modifier | study | int_est | int_se | int_p | n |
|---|---|---|---|---|---|
| B-vit deficiency count | Misame | 0.1162 | 0.0746 | 0.120 | 833 |
| Any B-vit deficiency | Misame | 0.1206 | 0.1781 | 0.499 | 833 |
| B-vit deficiency count | Vital | -0.1429 | 0.0928 | 0.125 | 300 |
| Any B-vit deficiency | Vital | -0.1486 | 0.2398 | 0.536 | 300 |
ggplot(d_bvit %>% mutate(bvit_group = ifelse(bvit_deficiency_count >= 2, "2+ deficient",
ifelse(bvit_deficiency_count == 1, "1 deficient", "0 deficient")),
bvit_group = factor(bvit_group, levels = c("0 deficient", "1 deficient", "2+ deficient"))),
aes(x = treatment, y = protein, fill = treatment)) +
geom_boxplot(alpha = 0.7, outlier.size = 0.8) +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3) +
facet_grid(study ~ bvit_group) +
labs(title = "Milk Protein by Treatment and B-Vitamin Deficiency Status",
x = "", y = "Protein (g/dL)") +
scale_fill_manual(values = trt_colors) +
theme(legend.position = "none")
anm_note("B-vitamin deficiency does not modify BEP effects on protein. However, there IS a significant B-vitamin deficiency x treatment interaction on fat and energy in Misame (see table below). This suggests B-vitamins may play a role in fat/energy metabolism even if not for protein synthesis specifically.")
bvit_macro_int <- list()
for (out in c("protein", "fat", "kcal.l")) {
for (s in c("Misame", "Vital")) {
ds <- d_bvit %>% filter(study == s, !is.na(.data[[out]]), !is.na(bvit_deficiency_count))
base_covars <- intersect(covars, names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0(out, " ~ treatment * bvit_deficiency_count + ",
paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = ds, na.action = na.omit), error = function(e) NULL)
if (is.null(fit)) next
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = ds[complete.cases(ds[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
int_row <- grep(":", rownames(ct))
if (length(int_row) > 0) {
bvit_macro_int[[paste(out, s)]] <- data.frame(
outcome = out, study = s,
int_est = ct[int_row, 1], int_p = ct[int_row, 4], n = nobs(fit))
}
}
}
bind_rows(bvit_macro_int) %>%
mutate(across(c(int_est, int_p), ~round(., 4))) %>%
kable(caption = "B-Vitamin Deficiency Count x Treatment: All Macronutrients") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| outcome | study | int_est | int_p | n |
|---|---|---|---|---|
| protein | Misame | 0.1162 | 0.1196 | 833 |
| protein | Vital | -0.1429 | 0.1247 | 300 |
| fat | Misame | 0.1896 | 0.0013 | 833 |
| fat | Vital | -0.0010 | 0.9918 | 300 |
| kcal.l | Misame | 0.1825 | 0.0022 | 833 |
| kcal.l | Vital | -0.0218 | 0.8284 | 300 |
We asked whether BEP affects amino acid composition, and whether these effects are modified by maternal nutritional status. We analyze all 20 free amino acids from the Biocrates targeted metabolomics panel, plus grouped summaries for essential AAs (His, Ile, Leu, Lys, Met, Phe, Thr, Trp, Val), non-essential AAs, and branched-chain AAs (Leu, Ile, Val).
Note on AA group summaries: Group means are computed as simple arithmetic averages of raw concentrations. Because individual AAs have different concentration scales, high-concentration AAs (e.g., glutamate) contribute more to the mean. This is a standard summary approach but should be interpreted alongside individual AA results.
aa_trt <- list()
for (aa in aa_all) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[aa]]))
res <- run_trt_effect(ds, aa, covars, s)
if (!is.null(res)) aa_trt[[paste(aa, s)]] <- res
}
}
aa_trt_df <- bind_rows(aa_trt) %>%
group_by(study) %>%
mutate(fdr_p = p.adjust(p, "BH")) %>%
ungroup() %>%
mutate(aa_label = aa_labels[outcome])
# Table shown in Appendix A below; forest plot follows
aa_trt_df <- aa_trt_df %>% mutate(sig_star = add_stars(p))
ggplot(aa_trt_df, aes(x = est, y = reorder(aa_label, est), color = study)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = ci_lo, xmax = ci_hi), height = 0.3, linewidth = 0.5,
position = position_dodge(0.6)) +
geom_point(size = 2, position = position_dodge(0.6)) +
geom_text(aes(x = ci_hi, label = sig_star), hjust = -0.3, size = 5, fontface = "bold",
position = position_dodge(0.6), show.legend = FALSE) +
labs(title = "Individual Amino Acid BEP Treatment Effects",
subtitle = "* p<0.05, ** p<0.01, *** p<0.001 (nominal)",
x = "Treatment Effect (95% CI)", y = "", color = "Study") +
scale_color_manual(values = study_colors) +
theme(legend.position = "top")
n_sig <- sum(aa_trt_df$p < 0.05)
sig_aas <- aa_trt_df %>% filter(p < 0.05) %>% arrange(p)
if (n_sig > 0) {
sig_text <- paste(sig_aas$aa_label, "(", sig_aas$study, ", p=", round(sig_aas$p, 3), ")", collapse = "; ", sep = "")
anm_note(paste0("There are ", n_sig, " nominally significant (p<0.05) individual AA treatment effects before FDR correction: ", sig_text, ". These should be interpreted cautiously given multiple testing across 20 AAs x 2 studies, but they provide potential leads for discussion. Notably, cysteine shows a significant effect in Misame that survives FDR correction, suggesting BEP may specifically affect sulfur amino acid metabolism."))
} else {
anm_note("No individual amino acid treatment effects reach nominal significance at p<0.05.")
}
aa_group_trt <- list()
for (grp in c("aa_essential", "aa_nonessential", "aa_bcaa")) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[grp]]))
res <- run_trt_effect(ds, grp, covars, s)
if (!is.null(res)) aa_group_trt[[paste(grp, s)]] <- res
}
}
aa_group_df <- bind_rows(aa_group_trt) %>%
group_by(study) %>%
mutate(fdr_p = p.adjust(p, "BH")) %>%
ungroup() %>%
mutate(outcome = recode(outcome,
aa_essential = "Essential AAs", aa_nonessential = "Non-essential AAs", aa_bcaa = "BCAAs"))
aa_group_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)), p = round(p, 4), fdr_p = round(fdr_p, 4)) %>%
kable(caption = "AA Group Treatment Effects") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| outcome | study | est | se | p | ci_lo | ci_hi | n | fdr_p |
|---|---|---|---|---|---|---|---|---|
| Essential AAs | Misame | -0.0857 | 0.0834 | 0.3045 | -0.2491 | 0.0778 | 843 | 0.7617 |
| Essential AAs | Vital | 0.0046 | 0.1626 | 0.9773 | -0.3140 | 0.3233 | 300 | 0.9773 |
| Non-essential AAs | Misame | -0.0239 | 0.0787 | 0.7617 | -0.1782 | 0.1305 | 843 | 0.7617 |
| Non-essential AAs | Vital | 0.1061 | 0.1400 | 0.4492 | -0.1683 | 0.3805 | 300 | 0.9462 |
| BCAAs | Misame | -0.0367 | 0.0845 | 0.6640 | -0.2024 | 0.1289 | 843 | 0.7617 |
| BCAAs | Vital | 0.0740 | 0.1538 | 0.6308 | -0.2274 | 0.3754 | 300 | 0.9462 |
aa_em_cont <- list()
aa_em_binary <- list()
for (aa in aa_all) {
for (s in c("Misame", "Vital")) {
# BMI continuous
ds <- d %>% filter(study == s, !is.na(.data[[aa]]), !is.na(mbmi))
res <- run_interaction(ds, aa, "mbmi", covars, s)
if (!is.null(res)) aa_em_cont[[paste(aa, "mbmi", s)]] <- res
# MUAC continuous
ds_muac <- d %>% filter(study == s, !is.na(.data[[aa]]), !is.na(mmuaccm))
res_muac <- run_interaction(ds_muac, aa, "mmuaccm", covars, s)
if (!is.null(res_muac)) aa_em_cont[[paste(aa, "mmuaccm", s)]] <- res_muac
# Binary BMI < 18.5
res2 <- run_binary_interaction(ds, aa, "mbmi", 18.5, covars, s)
if (!is.null(res2)) aa_em_binary[[paste(aa, s)]] <- res2
}
}
aa_em_cont_df <- bind_rows(aa_em_cont) %>%
group_by(study) %>% mutate(fdr_p = p.adjust(int_p, "BH")) %>% ungroup() %>%
mutate(aa_label = aa_labels[outcome])
aa_em_binary_df <- bind_rows(aa_em_binary) %>%
group_by(study) %>% mutate(fdr_p = p.adjust(int_p, "BH")) %>% ungroup() %>%
mutate(aa_label = aa_labels[outcome])
cat("=== Continuous BMI x Treatment interaction: nominal p < 0.05 ===\n")
## === Continuous BMI x Treatment interaction: nominal p < 0.05 ===
aa_em_cont_df %>% filter(int_p < 0.05) %>%
mutate(across(where(is.numeric), ~round(., 4))) %>%
select(aa_label, study, int_est, int_p, fdr_p) %>% print()
## # A tibble: 0 × 5
## # ℹ 5 variables: aa_label <chr>, study <chr>, int_est <dbl>, int_p <dbl>,
## # fdr_p <dbl>
cat("\n=== BMI < 18.5 cutoff interaction: nominal p < 0.05 ===\n")
##
## === BMI < 18.5 cutoff interaction: nominal p < 0.05 ===
aa_em_binary_df %>% filter(int_p < 0.05) %>%
mutate(across(where(is.numeric), ~round(., 4))) %>%
select(aa_label, study, int_p, fdr_p, ate_above, ate_below) %>% print()
## # A tibble: 1 × 6
## aa_label study int_p fdr_p ate_above ate_below
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Threonine Misame 0.008 0.160 -0.162 0.415
aa_em_cont_bmi <- aa_em_cont_df %>% filter(modifier == "mbmi")
ggplot(aa_em_cont_bmi, aes(x = int_est, y = -log10(int_p), color = study)) +
geom_point(size = 3, alpha = 0.7) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey40") +
geom_text(data = aa_em_cont_bmi %>% filter(int_p < 0.15),
aes(label = aa_label), vjust = -0.8, size = 3.5, show.legend = FALSE) +
labs(title = "AA: BMI x Treatment Continuous Interaction",
x = "Interaction Estimate", y = "-log10(p)", color = "Study") +
scale_color_manual(values = study_colors) +
theme(legend.position = "top")
# Summary counts
n_bmi_sig <- sum(aa_em_cont_df$int_p < 0.05 & aa_em_cont_df$modifier == "mbmi")
n_muac_sig <- sum(aa_em_cont_df$int_p < 0.05 & aa_em_cont_df$modifier == "mmuaccm")
n_bmi_tests <- sum(aa_em_cont_df$modifier == "mbmi")
n_muac_tests <- sum(aa_em_cont_df$modifier == "mmuaccm")
cat(sprintf("BMI x treatment: %d/%d nominally significant (p < 0.05)\n", n_bmi_sig, n_bmi_tests))
## BMI x treatment: 0/40 nominally significant (p < 0.05)
cat(sprintf("MUAC x treatment: %d/%d nominally significant (p < 0.05)\n", n_muac_sig, n_muac_tests))
## MUAC x treatment: 0/40 nominally significant (p < 0.05)
aa_grp_em <- list()
for (grp in c("aa_essential", "aa_nonessential", "aa_bcaa")) {
for (mod in c("mbmi", "mmuaccm")) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[grp]]), !is.na(.data[[mod]]))
res <- run_interaction(ds, grp, mod, covars, s)
if (!is.null(res)) aa_grp_em[[paste(grp, mod, s)]] <- res
}
}
}
aa_grp_em_df <- bind_rows(aa_grp_em) %>%
mutate(outcome = recode(outcome,
aa_essential = "Essential AAs", aa_nonessential = "Non-essential AAs", aa_bcaa = "BCAAs"),
modifier = recode(modifier, mbmi = "BMI", mmuaccm = "MUAC"))
aa_grp_em_df %>%
mutate(across(c(int_est, int_se), ~round(., 4)), int_p = round(int_p, 3)) %>%
kable(caption = "AA Group: Continuous Interaction Tests") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| outcome | modifier | study | int_est | int_se | int_p | n |
|---|---|---|---|---|---|---|
| Essential AAs | BMI | Misame | -0.0401 | 0.0259 | 0.122 | 843 |
| Essential AAs | BMI | Vital | -0.0881 | 0.0947 | 0.353 | 300 |
| Essential AAs | MUAC | Misame | -0.0395 | 0.0275 | 0.151 | 843 |
| Essential AAs | MUAC | Vital | -0.1473 | 0.1202 | 0.221 | 300 |
| Non-essential AAs | BMI | Misame | -0.0191 | 0.0252 | 0.449 | 843 |
| Non-essential AAs | BMI | Vital | -0.1021 | 0.0971 | 0.294 | 300 |
| Non-essential AAs | MUAC | Misame | -0.0087 | 0.0249 | 0.727 | 843 |
| Non-essential AAs | MUAC | Vital | -0.0500 | 0.1120 | 0.656 | 300 |
| BCAAs | BMI | Misame | -0.0307 | 0.0289 | 0.289 | 843 |
| BCAAs | BMI | Vital | -0.0807 | 0.0951 | 0.397 | 300 |
| BCAAs | MUAC | Misame | -0.0195 | 0.0303 | 0.520 | 843 |
| BCAAs | MUAC | Vital | -0.1552 | 0.1252 | 0.216 | 300 |
anm_note("No individual amino acid or AA group shows a significant BMI or MUAC x treatment interaction. The null finding for effect modification extends from total protein down to the individual amino acid level -- maternal anthropometry simply does not modify BEP effects on any aspect of milk protein/amino acid composition within either study.")
The ratio of essential to non-essential amino acids in milk may indicate protein quality and metabolic efficiency. If BEP alters this ratio, it suggests the supplement affects not just total protein but its composition.
# Compute ratio from RAW (unscaled) values, then scale the ratio itself
d <- d %>%
mutate(eaa_neaa_ratio = aa_essential_raw / aa_nonessential_raw) %>%
mutate(eaa_neaa_ratio = ifelse(is.infinite(eaa_neaa_ratio) | is.nan(eaa_neaa_ratio),
NA, eaa_neaa_ratio))
# Treatment effect on ratio
ratio_trt <- list()
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(eaa_neaa_ratio))
res <- run_trt_effect(ds, "eaa_neaa_ratio", covars, s)
if (!is.null(res)) ratio_trt[[s]] <- res
}
ratio_trt_df <- bind_rows(ratio_trt)
if (nrow(ratio_trt_df) > 0) {
ratio_trt_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)), p = round(p, 4)) %>%
kable(caption = "BEP Effect on Essential/Non-Essential AA Ratio") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
}
| outcome | study | est | se | p | ci_lo | ci_hi | n |
|---|---|---|---|---|---|---|---|
| eaa_neaa_ratio | Misame | -0.0029 | 0.0034 | 0.3847 | -0.0096 | 0.0037 | 843 |
| eaa_neaa_ratio | Vital | -0.0027 | 0.0049 | 0.5753 | -0.0122 | 0.0068 | 300 |
ggplot(d %>% filter(!is.na(eaa_neaa_ratio)),
aes(x = treatment, y = eaa_neaa_ratio, fill = treatment)) +
geom_boxplot(alpha = 0.7, outlier.size = 0.8) +
stat_summary(fun = mean, geom = "point", shape = 18, size = 3) +
facet_wrap(~study) +
labs(title = "Essential / Non-Essential AA Ratio by Treatment",
x = "", y = "EAA / NEAA Ratio") +
scale_fill_manual(values = trt_colors) +
theme(legend.position = "none")
# Does the ratio show effect modification by BMI?
ratio_em <- list()
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(eaa_neaa_ratio), !is.na(mbmi))
res <- run_interaction(ds, "eaa_neaa_ratio", "mbmi", covars, s)
if (!is.null(res)) ratio_em[[s]] <- res
}
if (length(ratio_em) > 0) {
bind_rows(ratio_em) %>%
mutate(across(c(int_est, int_se), ~round(., 4)), int_p = round(int_p, 3)) %>%
kable(caption = "EAA/NEAA Ratio: BMI x Treatment Interaction") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
}
| outcome | modifier | study | int_est | int_se | int_p | n |
|---|---|---|---|---|---|---|
| eaa_neaa_ratio | mbmi | Misame | -0.0014 | 0.0014 | 0.333 | 843 |
| eaa_neaa_ratio | mbmi | Vital | -0.0007 | 0.0038 | 0.846 | 300 |
anm_note("The EAA/NEAA ratio is not significantly affected by BEP in either study. This suggests BEP does not selectively alter the essential amino acid profile of milk -- the supplement's effect (where present) appears to be on total protein quantity rather than amino acid quality.")
We specifically examined BEP effects on beta-casein, alpha-lactalbumin, lactoferrin, and immunoglobulins. These are available from the untargeted DIANN proteomics platform (identified by UniProt accession codes). The proteomics subsample is smaller than the main dataset since proteomics was run on a subset of milk samples.
# Load untargeted proteomics and extract key proteins
library(stringr)
load_proteomics <- function(study_name, file_path) {
prot <- read.csv(file_path)
colnames(prot)[1] <- "bmid"
if (study_name == "Vital") {
prot$bmid <- str_extract(prot$bmid, "A\\d+-LW")
}
prot <- prot %>% mutate(study = study_name)
# Extract key proteins by UniProt ID
key_ids <- c(P02788 = "lactoferrin", P00709 = "alpha_lactalbumin",
P05814 = "beta_casein", P07498 = "kappa_casein",
P10451 = "osteopontin", P01876 = "IgA1")
# Convert high-missingness columns to NA (consistent with main pipeline)
# Columns with >50% NA are flagged
for (col in setdiff(names(prot), c("bmid", "study"))) {
if (is.numeric(prot[[col]]) && mean(is.na(prot[[col]])) > 0.5) {
prot[[col]] <- NA_real_
}
}
available <- intersect(names(key_ids), names(prot))
if (length(available) > 0) {
prot_key <- prot %>% select(study, bmid, all_of(available))
# Rename to protein names
for (uid in available) {
names(prot_key)[names(prot_key) == uid] <- key_ids[uid]
}
return(prot_key)
}
return(NULL)
}
prot_vital <- load_proteomics("Vital", paste0(here::here(), "/data/milk/PBL_VITAL_L_DIANN_Protein.csv"))
prot_misame <- load_proteomics("Misame", paste0(here::here(), "/data/milk/PBL_MISAME_DIANN_Protein.csv"))
prot_all <- bind_rows(prot_vital, prot_misame)
# Merge with main data
d_prot <- d %>%
select(study, subjid, bmid, visit, treatment, mbmi, mmuaccm, all_of(covars)) %>%
inner_join(prot_all, by = c("study", "bmid"))
# Scale proteomics outcomes within study (same approach as main outcomes)
milk_proteins <- c("lactoferrin", "alpha_lactalbumin", "beta_casein",
"kappa_casein", "osteopontin", "IgA1")
prot_outcome_vars <- intersect(milk_proteins, names(d_prot))
d_prot <- d_prot %>%
group_by(study) %>%
mutate(across(all_of(prot_outcome_vars), ~as.numeric(scale(.)))) %>%
ungroup()
cat("Proteomics merged and scaled: n =", nrow(d_prot), "\n")
## Proteomics merged and scaled: n = 256
print(table(d_prot$study, d_prot$treatment))
##
## Control BEP
## Misame 50 56
## Vital 50 100
milk_proteins <- c("lactoferrin", "alpha_lactalbumin", "beta_casein",
"kappa_casein", "osteopontin", "IgA1")
prot_avail <- d_prot %>%
group_by(study) %>%
summarise(across(all_of(milk_proteins),
~sum(!is.na(.)), .names = "n_{.col}"),
total = n(), .groups = "drop")
prot_avail %>%
kable(caption = "Major Milk Protein Data Availability (observations with non-NA values)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| study | n_lactoferrin | n_alpha_lactalbumin | n_beta_casein | n_kappa_casein | n_osteopontin | n_IgA1 | total |
|---|---|---|---|---|---|---|---|
| Misame | 106 | 106 | 106 | 106 | 106 | 106 | 106 |
| Vital | 150 | 150 | 150 | 150 | 150 | 150 | 150 |
Note: Beta-casein, alpha-lactalbumin, lactoferrin, kappa-casein, osteopontin, and IgA1 are all available from the untargeted DIANN proteomics platform (UniProt identifiers). The targeted MSD panel also measured IgA separately.
prot_trt <- list()
for (prot in milk_proteins) {
for (s in c("Misame", "Vital")) {
ds <- d_prot %>% filter(study == s, !is.na(.data[[prot]]))
res <- run_trt_effect(ds, prot, covars, s)
if (!is.null(res)) prot_trt[[paste(prot, s)]] <- res
}
}
prot_trt_df <- bind_rows(prot_trt)
# Treatment effect table moved to Appendix A; forest plot follows
if (nrow(prot_trt_df) > 0) {
prot_trt_df <- prot_trt_df %>% mutate(sig_star = add_stars(p))
ggplot(prot_trt_df, aes(x = est, y = reorder(outcome, est), color = study)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = ci_lo, xmax = ci_hi), height = 0.3, linewidth = 0.6,
position = position_dodge(0.5)) +
geom_point(size = 3, position = position_dodge(0.5)) +
geom_text(aes(x = ci_hi, label = sig_star), hjust = -0.3, size = 5, fontface = "bold",
position = position_dodge(0.5), show.legend = FALSE) +
facet_wrap(~study, scales = "free_x") +
labs(title = "Major Milk Protein BEP Treatment Effects",
subtitle = "SD units; * p<0.05",
x = "Treatment Effect (95% CI, SD units)", y = "", color = "Study") +
scale_color_manual(values = study_colors) +
theme(legend.position = "none")
}
prot_em <- list()
for (prot in milk_proteins) {
for (mod in c("mbmi", "mmuaccm")) {
for (s in c("Misame", "Vital")) {
ds <- d_prot %>% filter(study == s, !is.na(.data[[prot]]), !is.na(.data[[mod]]))
res <- run_interaction(ds, prot, mod, covars, s)
if (!is.null(res)) prot_em[[paste(prot, mod, s)]] <- res
}
}
}
prot_em_df <- bind_rows(prot_em) %>%
mutate(modifier = recode(modifier, mbmi = "BMI", mmuaccm = "MUAC"))
prot_em_df %>%
mutate(across(c(int_est, int_se), ~round(., 2)), int_p = round(int_p, 3)) %>%
select(outcome, modifier, study, int_est, int_se, int_p, n) %>%
kable(caption = "Major Milk Proteins: Continuous Interaction Tests") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| outcome | modifier | study | int_est | int_se | int_p | n |
|---|---|---|---|---|---|---|
| lactoferrin | BMI | Misame | 0.11 | 0.06 | 0.086 | 106 |
| lactoferrin | BMI | Vital | 0.05 | 0.11 | 0.667 | 150 |
| lactoferrin | MUAC | Misame | 0.11 | 0.07 | 0.096 | 106 |
| lactoferrin | MUAC | Vital | 0.02 | 0.12 | 0.901 | 150 |
| alpha_lactalbumin | BMI | Misame | 0.02 | 0.06 | 0.783 | 106 |
| alpha_lactalbumin | BMI | Vital | 0.05 | 0.12 | 0.695 | 150 |
| alpha_lactalbumin | MUAC | Misame | 0.04 | 0.07 | 0.593 | 106 |
| alpha_lactalbumin | MUAC | Vital | 0.05 | 0.14 | 0.750 | 150 |
| beta_casein | BMI | Misame | 0.04 | 0.06 | 0.543 | 106 |
| beta_casein | BMI | Vital | 0.05 | 0.14 | 0.705 | 150 |
| beta_casein | MUAC | Misame | 0.03 | 0.06 | 0.564 | 106 |
| beta_casein | MUAC | Vital | 0.22 | 0.16 | 0.169 | 150 |
| kappa_casein | BMI | Misame | 0.00 | 0.06 | 0.986 | 106 |
| kappa_casein | BMI | Vital | 0.11 | 0.12 | 0.366 | 150 |
| kappa_casein | MUAC | Misame | 0.00 | 0.07 | 0.976 | 106 |
| kappa_casein | MUAC | Vital | 0.19 | 0.17 | 0.268 | 150 |
| osteopontin | BMI | Misame | -0.08 | 0.05 | 0.148 | 106 |
| osteopontin | BMI | Vital | 0.15 | 0.10 | 0.144 | 150 |
| osteopontin | MUAC | Misame | -0.02 | 0.06 | 0.686 | 106 |
| osteopontin | MUAC | Vital | 0.05 | 0.13 | 0.687 | 150 |
| IgA1 | BMI | Misame | -0.01 | 0.06 | 0.922 | 106 |
| IgA1 | BMI | Vital | 0.12 | 0.14 | 0.392 | 150 |
| IgA1 | MUAC | Misame | 0.02 | 0.06 | 0.722 | 106 |
| IgA1 | MUAC | Vital | 0.16 | 0.13 | 0.239 | 150 |
anm_note("Alpha-lactalbumin is the only major milk protein with a nominally significant BEP effect (Vital only, p=0.022). Alpha-lactalbumin is a major whey protein that regulates lactose synthesis and is nutritionally sensitive. This finding is biologically plausible but needs verification against batch effects in the proteomics data. No anthropometry x treatment interactions for any major milk protein.")
The MOMI SGA analyses used AGA as the comparator (not all non-SGA), and also ran a “healthy AGA” sensitivity analysis excluding other adverse pregnancy outcomes. We replicate this approach here.
growth_full <- read.csv(paste0(here::here(), "/data/imic_full_growth_outcomes_dataset.csv"))
growth_ref <- growth_full %>%
filter(studyid %in% c("MISAME-3", "VITAL-Lactation")) %>%
mutate(subjid = as.character(subjid),
study = recode(studyid, "MISAME-3" = "Misame", "VITAL-Lactation" = "Vital")) %>%
filter(!is.na(sga), !is.na(waz_GA)) %>%
mutate(
sga10 = ifelse(sga == 1, 1, 0),
sga3 = ifelse(waz_GA < 3, 1, 0),
lga = ifelse(waz_GA > 90, 1, 0),
aga = ifelse(sga == 0 & lga == 0, 1, 0),
# Healthy AGA: AGA + not preterm + not LBW
healthy_aga = ifelse(aga == 1 & (is.na(preterm) | preterm == 0) &
(is.na(lbw) | lbw == 0), 1, 0)
) %>%
select(subjid, study, sga10, sga3, lga, aga, healthy_aga, waz_GA, preterm, lbw) %>%
distinct()
cat("=== SGA/AGA/LGA Distribution ===\n")
## === SGA/AGA/LGA Distribution ===
for (s in c("Misame", "Vital")) {
gs <- growth_ref %>% filter(study == s)
cat(sprintf("\n%s: SGA10=%d, AGA=%d, LGA=%d, healthy_AGA=%d, SGA3=%d\n",
s, sum(gs$sga10), sum(gs$aga), sum(gs$lga), sum(gs$healthy_aga), sum(gs$sga3)))
}
##
## Misame: SGA10=73, AGA=211, LGA=5, healthy_AGA=203, SGA3=31
##
## Vital: SGA10=57, AGA=90, LGA=2, healthy_AGA=78, SGA3=30
# Merge with milk data
d_sga_ref <- d %>% inner_join(growth_ref, by = c("subjid", "study"))
# SGA10 vs AGA (exclude LGA)
d_sga_aga <- d_sga_ref %>% filter(sga10 == 1 | aga == 1) %>%
mutate(sga_group = factor(ifelse(sga10 == 1, "SGA10", "AGA"), levels = c("AGA", "SGA10")))
cat("\nSGA10 vs AGA sample:\n")
##
## SGA10 vs AGA sample:
print(table(d_sga_aga$study, d_sga_aga$sga_group))
##
## AGA SGA10
## Misame 616 212
## Vital 180 114
sga_aga_aa <- list()
for (aa in aa_all) {
for (s in c("Misame", "Vital")) {
ds <- d_sga_aga %>% filter(study == s, !is.na(.data[[aa]]))
if (nrow(ds) < 20) next
base_covars <- intersect(c("treatment", covars), names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0("`", aa, "` ~ sga_group + ", paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = ds, na.action = na.omit), error = function(e) NULL)
if (is.null(fit)) next
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = ds[complete.cases(ds[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
sga_row <- which(rownames(ct) == "sga_groupSGA10")
if (length(sga_row) > 0) {
sga_aga_aa[[paste(aa, s)]] <- data.frame(
outcome = aa, aa_label = aa_labels[aa], study = s,
est = ct[sga_row, 1], se = ct[sga_row, 2], p = ct[sga_row, 4], n = nobs(fit))
}
}
}
sga_aga_aa_df <- bind_rows(sga_aga_aa) %>%
group_by(study) %>% mutate(fdr_p = p.adjust(p, "BH")) %>% ungroup()
sga_aga_aa_df %>%
mutate(across(c(est, se), ~round(., 4)), p = round(p, 4), fdr_p = round(fdr_p, 4),
sig = add_stars(p),
highlight = ifelse(outcome %in% c("gln", "ser"), "**", "")) %>%
arrange(study, p) %>%
select(aa_label, study, est, se, p, fdr_p, sig, n) %>%
datatable(caption = "SGA10 vs AGA: Individual AA Associations",
filter = "top", rownames = FALSE,
options = list(pageLength = 15, order = list(list(4, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
d_sga_healthy <- d_sga_ref %>% filter(sga10 == 1 | healthy_aga == 1) %>%
mutate(sga_group = factor(ifelse(sga10 == 1, "SGA10", "Healthy AGA"),
levels = c("Healthy AGA", "SGA10")))
cat("SGA10 vs Healthy AGA sample:\n")
## SGA10 vs Healthy AGA sample:
print(table(d_sga_healthy$study, d_sga_healthy$sga_group))
##
## Healthy AGA SGA10
## Misame 596 212
## Vital 156 114
sga_healthy_aa <- list()
for (aa in aa_all) {
for (s in c("Misame", "Vital")) {
ds <- d_sga_healthy %>% filter(study == s, !is.na(.data[[aa]]))
if (nrow(ds) < 20) next
base_covars <- intersect(c("treatment", covars), names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0("`", aa, "` ~ sga_group + ", paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = ds, na.action = na.omit), error = function(e) NULL)
if (is.null(fit)) next
ct <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = ds[complete.cases(ds[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
sga_row <- which(rownames(ct) == "sga_groupSGA10")
if (length(sga_row) > 0) {
sga_healthy_aa[[paste(aa, s)]] <- data.frame(
outcome = aa, aa_label = aa_labels[aa], study = s,
est = ct[sga_row, 1], se = ct[sga_row, 2], p = ct[sga_row, 4], n = nobs(fit))
}
}
}
sga_healthy_aa_df <- bind_rows(sga_healthy_aa) %>%
group_by(study) %>% mutate(fdr_p = p.adjust(p, "BH")) %>% ungroup()
sga_healthy_aa_df %>%
mutate(across(c(est, se), ~round(., 4)), p = round(p, 4), fdr_p = round(fdr_p, 4),
sig = add_stars(p)) %>%
arrange(study, p) %>%
select(aa_label, study, est, se, p, fdr_p, sig, n) %>%
datatable(caption = "SGA10 vs Healthy AGA: Individual AA Associations",
filter = "top", rownames = FALSE,
options = list(pageLength = 15, order = list(list(4, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
anm_note("SGA10 infants have lower milk glutamine in Misame (p=0.031 vs AGA), but this does not replicate in Vital. Serine is not significant in either study. The glutamine signal is partially consistent with the MOMI finding of prenatal amino acid depletion in SGA mothers, but the postnatal milk context is biologically distinct. The healthy-AGA sensitivity analysis yields similar results, suggesting the finding is not driven by other adverse pregnancy outcomes in the comparator group.")
# Check SGA3 sample sizes
sga3_tab <- d_sga_ref %>% filter(!is.na(sga3)) %>%
group_by(study) %>% summarise(n_sga3 = sum(sga3), n_total = n(), .groups = "drop")
cat("SGA3 counts:\n"); print(sga3_tab)
## SGA3 counts:
## # A tibble: 2 × 3
## study n_sga3 n_total
## <chr> <dbl> <int>
## 1 Misame 92 842
## 2 Vital 60 298
# Only run if adequate sample
sga3_results <- list()
for (s in c("Misame", "Vital")) {
ds <- d_sga_ref %>% filter(study == s, (sga3 == 1 | aga == 1), !is.na(sga3))
ds$sga3_group <- factor(ifelse(ds$sga3 == 1, "SGA3", "AGA"), levels = c("AGA", "SGA3"))
tab <- table(ds$sga3_group)
if (min(tab) < 10) {
cat(sprintf("%s: SGA3 n=%d -- too small for reliable analysis\n", s, tab["SGA3"]))
next
}
for (aa in c("gln", "ser", "protein")) {
if (!aa %in% names(ds) || sum(!is.na(ds[[aa]])) < 20) next
base_covars <- intersect(c("treatment", covars), names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0("`", aa, "` ~ sga3_group + ", paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = ds, na.action = na.omit), error = function(e) NULL)
if (is.null(fit)) next
ct <- tryCatch(coeftest(fit, vcov. = vcovHC(fit, type = "HC1")), error = function(e) NULL)
if (is.null(ct)) next
sga_row <- which(rownames(ct) == "sga3_groupSGA3")
if (length(sga_row) > 0) {
sga3_results[[paste(aa, s)]] <- data.frame(
outcome = aa, study = s, est = ct[sga_row, 1], p = ct[sga_row, 4], n = nobs(fit))
}
}
}
if (length(sga3_results) > 0) {
bind_rows(sga3_results) %>%
mutate(across(c(est, p), ~round(., 4))) %>%
kable(caption = "SGA3 vs AGA: Key Outcomes (if feasible)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
} else {
cat("SGA3 sample sizes too small for reliable analysis in both studies.\n")
}
| outcome | study | est | p | n |
|---|---|---|---|---|
| gln | Misame | -0.1058 | 0.3202 | 706 |
| ser | Misame | -0.1008 | 0.3959 | 706 |
| protein | Misame | -0.0344 | 0.8066 | 700 |
| gln | Vital | -0.2804 | 0.0825 | 240 |
| ser | Vital | -0.1867 | 0.2487 | 240 |
| protein | Vital | 0.1318 | 0.4444 | 240 |
The main SGA analyses adjust for treatment arm. As a sensitivity analysis, we repeat the analysis subsetting to the control arm only, eliminating any concern that BEP could mediate the SGA-milk relationship.
d_sga_ctrl <- d_sga_aga %>% filter(treatment == "Control")
cat("Control-only SGA sample:\n")
## Control-only SGA sample:
print(table(d_sga_ctrl$study, d_sga_ctrl$sga_group))
##
## AGA SGA10
## Misame 291 94
## Vital 56 42
sga_ctrl_aa <- list()
for (aa in aa_all) {
for (s in c("Misame", "Vital")) {
ds <- d_sga_ctrl %>% filter(study == s, !is.na(.data[[aa]]))
if (nrow(ds) < 15) next
base_covars <- intersect(covars, names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0("`", aa, "` ~ sga_group + ", paste(base_covars, collapse = " + ")))
fit <- tryCatch(lm(fml, data = ds, na.action = na.omit), error = function(e) NULL)
if (is.null(fit) || nobs(fit) < 10) next
ct <- tryCatch(coeftest(fit, vcov. = vcovHC(fit, type = "HC1")), error = function(e) NULL)
if (is.null(ct)) next
sga_row <- which(rownames(ct) == "sga_groupSGA10")
if (length(sga_row) > 0) {
sga_ctrl_aa[[paste(aa, s)]] <- data.frame(
outcome = aa, aa_label = aa_labels[aa], study = s,
est = ct[sga_row, 1], se = ct[sga_row, 2], p = ct[sga_row, 4], n = nobs(fit))
}
}
}
sga_ctrl_df <- bind_rows(sga_ctrl_aa)
if (nrow(sga_ctrl_df) > 0) {
sga_ctrl_df %>%
mutate(across(c(est, se), ~round(., 4)), p = round(p, 4),
highlight = ifelse(outcome %in% c("gln", "ser"), "**", "")) %>%
arrange(study, p) %>%
filter(p < 0.20 | outcome %in% c("gln", "ser")) %>%
select(aa_label, study, est, se, p, n, highlight) %>%
kable(caption = "SGA10 vs AGA (Control Arm Only): Top AA Associations") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE, font_size = 12)
}
| aa_label | study | est | se | p | n | highlight | |
|---|---|---|---|---|---|---|---|
| asp | Aspartate | Misame | -0.2005 | 0.0996 | 0.0448 | 384 | |
| phe | Phenylalanine | Misame | 0.2691 | 0.1344 | 0.0460 | 384 | |
| met | Methionine | Misame | 0.3028 | 0.1626 | 0.0633 | 384 | |
| leu | Leucine | Misame | 0.3207 | 0.1916 | 0.0951 | 384 | |
| trp_bio…5 | Tryptophan | Misame | 0.3792 | 0.2426 | 0.1189 | 384 | |
| gly | Glycine | Misame | 0.2287 | 0.1485 | 0.1244 | 384 | |
| ile | Isoleucine | Misame | 0.2637 | 0.1846 | 0.1540 | 384 | |
| ala | Alanine | Misame | 0.1713 | 0.1202 | 0.1550 | 384 | |
| lys | Lysine | Misame | 0.2302 | 0.1750 | 0.1893 | 384 | |
| ser…10 | Serine | Misame | 0.0800 | 0.1235 | 0.5179 | 384 | ** |
| gln…11 | Glutamine | Misame | -0.0694 | 0.1110 | 0.5323 | 384 | ** |
| cys | Cysteine | Vital | -0.4618 | 0.1759 | 0.0102 | 98 | |
| ser…13 | Serine | Vital | -0.4532 | 0.1940 | 0.0218 | 98 | ** |
| gln…14 | Glutamine | Vital | -0.3756 | 0.1672 | 0.0272 | 98 | ** |
| thr | Threonine | Vital | -0.4378 | 0.2425 | 0.0745 | 98 | |
| glu | Glutamate | Vital | -0.3248 | 0.2271 | 0.1561 | 98 | |
| trp_bio…17 | Tryptophan | Vital | -0.2345 | 0.1760 | 0.1862 | 98 |
anm_note("The control-only sensitivity analysis has reduced power but tests whether the SGA-glutamine association is independent of BEP treatment. If the direction and approximate magnitude are consistent with the full-sample adjusted analysis, this supports a genuine SGA-milk composition link rather than a BEP-mediated artifact.")
ggplot(sga_aga_aa_df, aes(x = est, y = -log10(p), color = study)) +
geom_point(size = 3, alpha = 0.7) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "grey40") +
geom_text(data = sga_aga_aa_df %>% filter(p < 0.15 | outcome %in% c("gln","ser")),
aes(label = aa_label), vjust = -0.8, size = 3.5, show.legend = FALSE) +
labs(title = "SGA10 vs AGA: Amino Acid Associations",
x = "SGA10 vs AGA Difference", y = "-log10(p)", color = "Study") +
scale_color_manual(values = study_colors) +
theme(legend.position = "top")
Interpretation: Using the AGA comparator (as in the MOMI protocol), the glutamine signal in Misame persists (p=0.031 for SGA10 vs AGA, direction: lower glutamine in SGA). The healthy-AGA sensitivity analysis yields similar results. However, this finding does not replicate in Vital/Mumta-LW. Serine is not significantly associated with SGA in either study. The postnatal milk context differs from prenatal blood (where MOMI found depletion), so partial replication of one AA out of two is not strong evidence of a shared mechanism.
Team discussion flagged that leucine accumulation in milk may be negatively linked to protein levels, suggesting that even with adequate amino acid availability, other factors (e.g., B-vitamins) may limit protein synthesis.
ggplot(d %>% filter(!is.na(protein), !is.na(leu)),
aes(x = leu, y = protein, color = treatment)) +
geom_point(alpha = 0.3, size = 1.5) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1) +
facet_wrap(~study) +
labs(title = "Leucine vs. Milk Protein by Treatment",
x = "Leucine (uM)", y = "Protein (g/dL)", color = "Treatment") +
scale_color_manual(values = trt_colors) +
theme(legend.position = "top")
leu_cor <- list()
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(protein), !is.na(leu))
ct <- cor.test(ds$protein, ds$leu, method = "pearson")
leu_cor[[s]] <- data.frame(study = s, r = ct$estimate, p = ct$p.value, n = nrow(ds))
# Test if BEP modifies the leu-protein relationship
base_covars <- intersect(covars, names(ds))
base_covars <- base_covars[sapply(base_covars, function(v) sum(is.na(ds[[v]])) < nrow(ds) * 0.5)]
fml <- as.formula(paste0("protein ~ treatment * leu + ", paste(base_covars, collapse = " + ")))
fit <- lm(fml, data = ds, na.action = na.omit)
ct2 <- tryCatch({
vcov_cl <- vcovCL(fit, cluster = ds[complete.cases(ds[, all.vars(fml)]), "subjid"])
coeftest(fit, vcov. = vcov_cl)
}, error = function(e) coeftest(fit, vcov. = vcovHC(fit, type = "HC1")))
int_row <- grep(":", rownames(ct2))
cat(sprintf("%s: leu-protein r=%.3f (p=%.4f); treatment x leu interaction p=%.4f\n",
s, ct$estimate, ct$p.value, ct2[int_row, 4]))
}
## Misame: leu-protein r=0.194 (p=0.0000); treatment x leu interaction p=0.0276
## Vital: leu-protein r=0.285 (p=0.0000); treatment x leu interaction p=0.8346
bind_rows(leu_cor) %>%
mutate(across(c(r, p), ~round(., 4))) %>%
kable(caption = "Leucine-Protein Correlation by Study") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| study | r | p | n | |
|---|---|---|---|---|
| cor…1 | Misame | 0.1939 | 0 | 833 |
| cor…2 | Vital | 0.2852 | 0 | 300 |
anm_note("Contrary to the hypothesis from team discussion, leucine and protein are positively correlated (not negatively). Higher free leucine tracks with higher total protein, suggesting leucine is not accumulating due to impaired protein synthesis. BEP does not modify this relationship.")
The main manuscript noted a transient BEP effect in Vital at the first timepoint. We examine treatment effects at each visit separately to characterize this pattern and test whether it appears for amino acids as well.
visit_trt <- list()
for (out in c("protein", "fat", "kcal.l")) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[out]]))
for (v in sort(unique(ds$visit))) {
dsv <- ds %>% filter(visit == v)
if (nrow(dsv) < 20 || length(unique(dsv$treatment)) < 2) next
res <- run_trt_effect(dsv, out, covars, s)
if (!is.null(res)) {
res$visit <- v
visit_trt[[paste(out, s, v)]] <- res
}
}
}
}
visit_trt_df <- bind_rows(visit_trt)
visit_trt_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)), p = round(p, 4),
sig = add_stars(p)) %>%
arrange(outcome, study, visit) %>%
datatable(caption = "Macronutrient Treatment Effects by Study and Visit",
rownames = FALSE,
options = list(pageLength = 20, order = list(list(6, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
if (nrow(visit_trt_df) > 0) {
visit_trt_df$visit_label <- paste0("Visit ", visit_trt_df$visit)
visit_trt_df <- visit_trt_df %>% mutate(sig_star = add_stars(p))
ggplot(visit_trt_df, aes(x = est, y = visit_label, color = study)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = ci_lo, xmax = ci_hi), height = 0.2, linewidth = 0.7,
position = position_dodge(0.4)) +
geom_point(size = 3, position = position_dodge(0.4)) +
geom_text(aes(x = ci_hi, label = sig_star), hjust = -0.3, size = 5, fontface = "bold",
position = position_dodge(0.4), show.legend = FALSE) +
facet_wrap(~outcome, scales = "free_x", ncol = 1) +
labs(title = "BEP Treatment Effects by Visit",
subtitle = "SD units; * p<0.05, ** p<0.01",
x = "Treatment Effect (95% CI, SD units)", y = "", color = "Study") +
scale_color_manual(values = study_colors) +
theme(legend.position = "top")
}
# Focus on Vital visit 40 where macro effects were strongest
aa_v40 <- list()
ds_v40 <- d %>% filter(study == "Vital", visit == 40)
if (nrow(ds_v40) > 20 && length(unique(ds_v40$treatment)) == 2) {
for (aa in aa_all) {
dsv <- ds_v40 %>% filter(!is.na(.data[[aa]]))
res <- run_trt_effect(dsv, aa, covars, "Vital")
if (!is.null(res)) {
res$visit <- 40
aa_v40[[aa]] <- res
}
}
}
aa_v40_df <- bind_rows(aa_v40) %>%
mutate(aa_label = aa_labels[outcome]) %>%
group_by() %>% mutate(fdr_p = p.adjust(p, "BH")) %>% ungroup()
if (nrow(aa_v40_df) > 0) {
aa_v40_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)),
p = round(p, 4), fdr_p = round(fdr_p, 4),
sig = add_stars(p)) %>%
arrange(p) %>%
select(aa_label, est, ci_lo, ci_hi, p, fdr_p, sig, n) %>%
datatable(caption = "AA Treatment Effects at Vital Day 40",
rownames = FALSE,
options = list(pageLength = 20, order = list(list(4, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
}
anm_note("The visit-specific results confirm the transient nature of BEP effects in Vital/Mumta-LW: protein and energy effects are concentrated at day 40 and absent by day 56. This could reflect (1) an early-lactation sensitivity window, (2) a dietary novelty response to chickpea-based BEP in a low-pulse-consuming population (analogous to the transient triglyceride effect in Misame), or (3) regression to the mean. The fact that this pattern appears across multiple labs and analytes argues against pure measurement artifact, but it remains a single-timepoint finding.")
For Vital (2 visits 2 weeks apart) and Misame (3 visits), we examine whether BEP affects the change in protein over time.
# Calculate within-subject change
d_wide <- d %>%
filter(!is.na(protein)) %>%
select(study, subjid, visit, treatment, protein) %>%
group_by(study, subjid, treatment) %>%
arrange(visit) %>%
mutate(visit_order = row_number()) %>%
ungroup()
# Vital: visit 40 -> 56
vital_vel <- d_wide %>% filter(study == "Vital") %>%
select(subjid, treatment, visit, protein) %>%
pivot_wider(names_from = visit, values_from = protein, names_prefix = "v") %>%
mutate(delta_protein = v56 - v40) %>%
filter(!is.na(delta_protein))
# Misame: visit 1 -> 2
misame_v12 <- d_wide %>% filter(study == "Misame", visit %in% c(1,2)) %>%
select(subjid, treatment, visit, protein) %>%
pivot_wider(names_from = visit, values_from = protein, names_prefix = "v") %>%
mutate(delta_protein = v2 - v1) %>%
filter(!is.na(delta_protein))
# Misame: visit 2 -> 3
misame_v23 <- d_wide %>% filter(study == "Misame", visit %in% c(2,3)) %>%
select(subjid, treatment, visit, protein) %>%
pivot_wider(names_from = visit, values_from = protein, names_prefix = "v") %>%
mutate(delta_protein = v3 - v2) %>%
filter(!is.na(delta_protein))
cat("Vital velocity (day 40 -> 56): n =", nrow(vital_vel), "\n")
## Vital velocity (day 40 -> 56): n = 150
cat("Misame velocity (visit 1 -> 2): n =", nrow(misame_v12), "\n")
## Misame velocity (visit 1 -> 2): n = 267
cat("Misame velocity (visit 2 -> 3): n =", nrow(misame_v23), "\n")
## Misame velocity (visit 2 -> 3): n = 273
# Test treatment effect on delta
for (label in c("Vital (40->56)", "Misame (1->2)", "Misame (2->3)")) {
ds <- if (grepl("Vital", label)) vital_vel else if (grepl("1->2", label)) misame_v12 else misame_v23
tt <- t.test(delta_protein ~ treatment, data = ds)
cat(sprintf("\n%s: Control delta=%.4f, BEP delta=%.4f, diff p=%.4f\n",
label, tt$estimate[1], tt$estimate[2], tt$p.value))
}
##
## Vital (40->56): Control delta=-0.1162, BEP delta=-0.4289, diff p=0.1011
##
## Misame (1->2): Control delta=-1.2571, BEP delta=-1.1287, diff p=0.2888
##
## Misame (2->3): Control delta=-0.2484, BEP delta=-0.2906, diff p=0.6516
vel_data <- bind_rows(
vital_vel %>% mutate(study = "Vital (day 40->56)"),
misame_v12 %>% mutate(study = "Misame (visit 1->2)"),
misame_v23 %>% mutate(study = "Misame (visit 2->3)")
)
ggplot(vel_data, aes(x = treatment, y = delta_protein, fill = treatment)) +
geom_boxplot(alpha = 0.7, outlier.size = 1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
facet_wrap(~study) +
labs(title = "Change in Milk Protein Between Visits by Treatment",
x = "", y = "Delta Protein (g/dL)") +
scale_fill_manual(values = trt_colors) +
theme(legend.position = "none")
anm_note("No significant BEP effect on protein velocity. In Vital, the BEP group shows a non-significant trend toward faster protein decline between day 40 and 56, which is consistent with the transient treatment effect observed in the visit-specific analysis -- the BEP-treated group starts higher at day 40 and regresses toward the control level by day 56.")
This section is purely descriptive. BEP protein source (chickpea in Mumta-LW vs peanut in Misame) is completely confounded by study, population, background diet, and implementation.
| Misame-3 (Burkina Faso) | Mumta-LW (Pakistan) | |
|---|---|---|
| BEP formulation | Peanut-based | Chickpea-based |
| Control condition | IFA throughout | No study supplement |
| BMI, mean (SD) | 22.0 (3.2) | 19.8 (1.7) |
| % BMI < 18.5 | 7.2% | 20.0% |
| MUAC, mean (SD) | 26.1 (2.7) | 21.5 (1.2) |
| Visits | 3 (14d, 1-2mo, 3-4mo) | 2 (day 40, day 56) |
# Combine macro + AA effects
all_trt_df <- bind_rows(
trt_macro_df %>% mutate(domain = "Macronutrient"),
aa_trt_df %>% mutate(domain = "Amino Acid", outcome = aa_label)
)
# Add B-vitamins if available
bvit_vars <- intersect(c("thiamine", "riboflavin", "niacin", "pantothenate",
"pyridoxal", "biotin", "folate", "cobalamin",
"b1", "b2", "b3", "b5", "b6", "b7", "b9", "b12"),
names(d))
if (length(bvit_vars) > 0) {
bvit_trt <- list()
for (bv in bvit_vars) {
for (s in c("Misame", "Vital")) {
ds <- d %>% filter(study == s, !is.na(.data[[bv]]))
res <- run_trt_effect(ds, bv, covars, s)
if (!is.null(res)) {
res$domain <- "B-vitamin"
bvit_trt[[paste(bv, s)]] <- res
}
}
}
if (length(bvit_trt) > 0) {
all_trt_df <- bind_rows(all_trt_df, bind_rows(bvit_trt))
}
}
# Plot all domains
ggplot(all_trt_df, aes(x = est, y = reorder(outcome, est), color = study)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = ci_lo, xmax = ci_hi), height = 0.3, linewidth = 0.6, position = position_dodge(0.5)) +
geom_point(size = 2.5, position = position_dodge(0.5)) +
facet_wrap(~domain, scales = "free", ncol = 1) +
labs(x = "BEP Treatment Effect (95% CI)", y = "", color = "Study") +
scale_color_manual(values = study_colors) +
theme(legend.position = "top", text = element_text(size = 12))
anm_note("The cross-study comparison shows that BEP effects are consistently larger in Vital/Mumta-LW across macronutrients, some amino acids, and B-vitamins. However, this pattern is descriptive only -- BEP formulation, control conditions, population, and dietary context all differ between studies. The chickpea-versus-peanut question cannot be answered from these data alone.")
Misame (MISAME-3): Both control and intervention arms received iron-folic acid (IFA) supplementation until 6 weeks postpartum. The control condition is IFA-only, while intervention arms added BEP during pregnancy and/or postpartum.
Vital (VITAL-Lactation): The control arm received no study supplement during the lactation period. The intervention arms received BEP (nutrient supplement) plus exclusive breastfeeding promotion, with or without azithromycin.
This means that comparing treatment effects across studies conflates BEP efficacy with differences in control conditions (IFA vs no supplement), population nutritional status, BEP formulation, and implementation context. The larger treatment effects in Vital could partly reflect a greater marginal impact of supplementation against a lower-nutrient control condition.
| Feature | Misame (Burkina Faso) | Mumta-LW (Pakistan) |
|---|---|---|
| Protein source | Peanut-based | Chickpea-based (may also contain whey – to be confirmed with trial team) |
| Control condition | IFA (iron-folic acid) throughout pregnancy + 6 weeks postpartum | No study supplement during lactation |
| Study design | 2x2 factorial (prenatal x postnatal BEP) | 3-arm (Control, BEP+EBF, BEP+EBF+AZT) |
| Enrollment criterion | General population | Low MUAC (< 23 cm) |
| Population BMI | Mean 22.0, 7% underweight | Mean 19.8, 20% underweight |
| Milk collection | 14-21 days, 1-2 months, 3-4 months | Day 40, Day 56 |
These differences mean that any cross-study comparison conflates multiple factors. The stronger BEP effects in Vital cannot be attributed to any single factor (formulation, nutritional status, or control condition).
# Compile comprehensive results table
# Build B-vitamin treatment effects df for inclusion
bvit_trt_for_summary <- if (exists("bvit_trt") && length(bvit_trt) > 0) {
bind_rows(bvit_trt) %>%
mutate(analysis = "Treatment effect", modifier = NA_character_, fdr_p = NA_real_)
} else NULL
all_results <- bind_rows(
# Macro treatment effects
trt_macro_df %>% mutate(analysis = "Treatment effect", domain = "Macronutrient",
modifier = NA_character_, fdr_p = NA_real_),
# AA treatment effects
aa_trt_df %>% mutate(analysis = "Treatment effect", domain = "Amino acid",
outcome = aa_label, modifier = NA_character_),
# AA group effects
aa_group_df %>% mutate(analysis = "Treatment effect", domain = "AA group",
modifier = NA_character_),
# Proteomics treatment effects
if (exists("prot_trt_df") && nrow(prot_trt_df) > 0) {
prot_trt_df %>% mutate(analysis = "Treatment effect", domain = "Milk protein",
modifier = NA_character_, fdr_p = NA_real_)
},
# B-vitamin treatment effects
bvit_trt_for_summary,
# Macro EM continuous
em_macro_df %>% transmute(analysis = "BMI/MUAC interaction", domain = "Macronutrient",
outcome = outcome_label, study, est = int_est, se = int_se,
ci_lo = int_est - 1.96*int_se, ci_hi = int_est + 1.96*int_se,
p = int_p, fdr_p = NA_real_, n, modifier = modifier_label),
# AA EM continuous (BMI + MUAC)
aa_em_cont_df %>% transmute(analysis = "AA anthropometry interaction", domain = "Amino acid",
outcome = aa_label, study, est = int_est, se = int_se,
ci_lo = int_est - 1.96*int_se, ci_hi = int_est + 1.96*int_se,
p = int_p, fdr_p, n, modifier = recode(modifier, mbmi = "BMI", mmuaccm = "MUAC")),
# SGA AGA results
if (exists("sga_aga_aa_df") && nrow(sga_aga_aa_df) > 0) {
sga_aga_aa_df %>% transmute(analysis = "SGA10 vs AGA", domain = "Amino acid (SGA)",
outcome = aa_label, study, est, se,
ci_lo = est - 1.96*se, ci_hi = est + 1.96*se,
p, fdr_p, n, modifier = NA_character_)
},
# B-vitamin deficiency interaction
if (exists("bvit_int_df") && nrow(bvit_int_df) > 0) {
bvit_int_df %>% transmute(analysis = "B-vit deficiency interaction", domain = "Macronutrient",
outcome = "protein", study, est = int_est, se = int_se,
ci_lo = int_est - 1.96*int_se, ci_hi = int_est + 1.96*int_se,
p = int_p, fdr_p = NA_real_, n, modifier)
},
# Visit-specific treatment effects
if (exists("visit_trt_df") && nrow(visit_trt_df) > 0) {
visit_trt_df %>% mutate(analysis = paste0("Treatment effect (visit ", visit, ")"),
domain = "Macronutrient", modifier = NA_character_, fdr_p = NA_real_)
}
) %>%
select(analysis, domain, outcome, study, est, ci_lo, ci_hi, p, fdr_p, n)
# Save CSV
write.csv(all_results,
file = paste0(here::here(), "/exploratory_analyses/sunny_bmgf_2026_04/results_table.csv"),
row.names = FALSE)
cat("Total results:", nrow(all_results), "rows saved to results_table.csv\n\n")
## Total results: 225 rows saved to results_table.csv
all_results %>%
mutate(across(c(est, ci_lo, ci_hi), ~round(., 4)),
p = round(p, 4), fdr_p = round(fdr_p, 4),
sig = add_stars(p)) %>%
datatable(caption = "Master Results Table (searchable; full data also saved to CSV)",
filter = "top", rownames = FALSE,
options = list(pageLength = 20, scrollX = TRUE,
order = list(list(7, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
These tables are provided for reference. The main results are presented as forest plots in the body of the report. Tables are searchable and sortable.
trt_macro_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)), p = round(p, 4),
sig = add_stars(p)) %>%
datatable(caption = "Macronutrient Treatment Effects by Study (SD units)",
rownames = FALSE,
options = list(pageLength = 10, order = list(list(4, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
aa_trt_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)),
p = round(p, 4), fdr_p = round(fdr_p, 4),
sig = add_stars(p)) %>%
arrange(study, p) %>%
select(aa_label, study, est, ci_lo, ci_hi, p, fdr_p, sig, n) %>%
datatable(caption = "Individual AA Treatment Effects (SD units)",
filter = "top", rownames = FALSE,
options = list(pageLength = 20, order = list(list(5, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
prot_trt_df %>%
mutate(across(c(est, se, ci_lo, ci_hi), ~round(., 4)), p = round(p, 4),
sig = add_stars(p)) %>%
datatable(caption = "Major Milk Protein Treatment Effects (SD units)",
rownames = FALSE,
options = list(pageLength = 15, order = list(list(4, "asc")))) %>%
formatStyle("p", backgroundColor = styleInterval(c(0.01, 0.05, 0.10),
c("#FDAE61", "#FEE08B", "#FFFFBF", "white")))
| Outcome | Definition | Source |
|---|---|---|
| Protein (g/dL) | Total milk protein from MIRIS analyzer | Macronutrient panel |
| Fat (g/dL) | Total milk fat from MIRIS analyzer | Macronutrient panel |
| Energy (kcal/L) | Calculated energy content | Macronutrient panel |
| Individual AAs | Free amino acid concentrations (uM) | Biocrates targeted metabolomics |
| Essential AAs | Mean of His, Ile, Leu, Lys, Met, Phe, Thr, Trp, Val | Computed from Biocrates |
| Non-essential AAs | Mean of Ala, Arg, Asn, Asp, Cys, Gln, Glu, Gly, Pro, Ser, Tyr | Computed from Biocrates |
| BCAAs | Mean of Leu, Ile, Val | Computed from Biocrates |
| EAA/NEAA Ratio | Essential AA mean / Non-essential AA mean | Computed from Biocrates |
| Lactoferrin | P02788 intensity | PBL DIANN untargeted proteomics |
| Alpha-lactalbumin | P00709 intensity | PBL DIANN untargeted proteomics |
| Beta-casein | P05814 intensity | PBL DIANN untargeted proteomics |
| IgA1 | P01876 intensity (also MSD targeted) | PBL DIANN / MSD |
| SGA | Birthweight < 10th centile for GA (intergrowth) | Growth outcomes dataset |
All milk outcome variables (macronutrients, individual amino acids,
amino acid group summaries) were z-score standardized within each study
using scale(). This serves three purposes: (1) it places
all outcomes on a common SD-unit scale, enabling direct comparison of
effect sizes across outcomes with different natural units; (2) it
handles the skewness and large concentration ranges of some analytes;
(3) it makes interaction coefficients interpretable as “SD change in
outcome per unit change in modifier.” Raw concentrations are preserved
internally for computing ratios (e.g., EAA/NEAA) where z-scoring would
be inappropriate. Proteomics intensities from the DIANN platform were
similarly scaled within study.
Two studies were analyzed: Misame-3 (Burkina Faso, 2x2 factorial prenatal/postnatal BEP trial) and Mumta-LW (Pakistan, 3-arm postnatal BEP trial, labeled “Vital” in the data). Treatment arms were collapsed to BEP vs Control following the same logic as the main manuscript: in Misame, IFA-only arms became Control and arms receiving any BEP component became BEP; in Vital, the nutrient supplement arms were collapsed to BEP. Only observations with both treatment assignment and milk data were included. Repeated measures across visits were retained (not averaged), and subject-level clustering was accounted for in all inferential analyses.
Treatment effects were estimated using ordinary least squares regression adjusting for child sex, maternal age, maternal education (years), maternal height, parity, season of delivery, and household wealth index. Maternal BMI and MUAC were deliberately excluded from the covariate set because prenatal BEP could affect these (post-treatment variables).
Standard errors were computed using cluster-robust variance
estimation (sandwich::vcovCL) clustering by subject ID to
account for repeated measures. When cluster-robust estimation failed
(e.g., singleton clusters), heteroscedasticity- consistent (HC1)
standard errors were used as a fallback.
Multiplicative interaction terms were tested via
outcome ~ treatment * modifier + covariates. Both
continuous modifiers (BMI, MUAC, maternal height, B-vitamin deficiency
count) and binary cutoffs (BMI < 18.5, MUAC < 21, height < 150
cm) were examined. For each, the interaction coefficient and its
cluster-robust p-value are reported. Subgroup-specific treatment effects
(ATE above/below cutoff) are derived from the interaction model
coefficients.
We used the grf R package (Athey, Tibshirani &
Wager, 2019) to fit generalized random forests tuned for estimating
conditional average treatment effects (CATEs). Key specifications:
The test_calibration() function was used to assess
whether the forest detects real heterogeneity. This tests two
hypotheses: (1) the mean forest prediction equals the true ATE
(calibration), and (2) the spread of predictions captures real
heterogeneity (differential calibration). A non-significant p-value for
the differential test indicates the data do not support meaningful
treatment effect heterogeneity.
Variable importance reflects the frequency with which each covariate was used for splitting treatment effects across the forest.
Two complementary methods were used to identify predictors of milk protein:
Lasso regression (glmnet, alpha=1):
L1-penalized regression with 10-fold cross-validation. The
lambda.1se rule was used (most regularized model within 1
SE of minimum CV error) to prioritize parsimony. Non-zero coefficients
identify retained predictors.
Random Forest (ranger, 500 trees):
Non-parametric ensemble method. Variable importance was based on
impurity reduction (increase in sum of squared errors when a variable is
used for splitting). Out-of-bag R-squared provides an unbiased estimate
of predictive performance.
Both models included all available baseline covariates plus study and visit indicators. Categorical variables were dummy-coded for the Lasso. Models were compared with and without the study indicator to assess whether site dominates prediction.
A composite deficiency score was created as the count of B-vitamins (B1, B2, B3, B6, B12) with milk concentrations below the study-specific 25th percentile. This threshold is a proxy for estimated average requirement (EAR) deficiency when population-level cutoffs are unavailable. The score ranges from 0 (no deficiency) to 5 (all five B-vitamins below the 25th percentile). Both the continuous count and a binary (any deficiency vs replete) version were tested as effect modifiers.
All 20 free amino acids from the Biocrates targeted metabolomics panel were analyzed. Group summaries (essential, non-essential, BCAA) were computed as arithmetic means of raw concentrations. Because individual AAs have different concentration scales, these means are weighted toward high-concentration AAs (e.g., glutamate). Standardization was considered but not applied to maintain interpretability in original units.
FDR correction (Benjamini-Hochberg) was applied within each study across the 20 amino acids. Both nominal and FDR-adjusted p-values are reported per the request to provide discussion material even for signals that do not survive multiple testing correction.
Six major milk proteins were extracted from the untargeted DIANN proteomics platform by matching UniProt accession codes: lactoferrin (P02788), alpha- lactalbumin (P00709), beta-casein (P05814), kappa-casein (P07498), osteopontin (P10451), and IgA1 (P01876). Columns with >50% missing values were set to NA (consistent with the main analysis pipeline). Protein intensities were used as-is (not log-transformed) for consistency with the treatment effect framework.
SGA was defined as birthweight < 10th centile for gestational age using INTERGROWTH-21st standards. Three comparator definitions were used:
All SGA models adjust for treatment arm (since BEP was randomized and could independently affect both milk and birth outcomes). A control-only sensitivity analysis was also conducted to verify that results are not driven by BEP-mediated confounding.
Within-subject change in protein between consecutive visits was computed using wide-format pivoting. Only subjects with both visits were included. Treatment effects on the delta were tested using two-sample t-tests (Welch).
Throughout this report, we distinguish between:
For exploratory analyses presented for discussion purposes, we report both and do not restrict interpretation to FDR-significant results only.
cat("Main analysis (BEP vs Control, Vital + Misame):\n")
## Main analysis (BEP vs Control, Vital + Misame):
print(table(d$study, d$treatment))
##
## Control BEP
## Misame 396 449
## Vital 100 200
cat("\nProteomics subsample:\n")
##
## Proteomics subsample:
print(table(d_prot$study, d_prot$treatment))
##
## Control BEP
## Misame 50 56
## Vital 50 100
cat("\nSGA refined (SGA10 vs AGA):\n")
##
## SGA refined (SGA10 vs AGA):
if (exists("d_sga_aga")) print(table(d_sga_aga$study, d_sga_aga$sga_group))
##
## AGA SGA10
## Misame 616 212
## Vital 180 114
cat("\nSGA refined (SGA10 vs Healthy AGA):\n")
##
## SGA refined (SGA10 vs Healthy AGA):
if (exists("d_sga_healthy")) print(table(d_sga_healthy$study, d_sga_healthy$sga_group))
##
## Healthy AGA SGA10
## Misame 596 212
## Vital 156 114
cat("\nB-vitamin deficiency subsample: n =", nrow(d_bvit), "\n")
##
## B-vitamin deficiency subsample: n = 1133
cat("ML predictor screen: n =", nrow(d_ml), "\n")
## ML predictor screen: n = 1133
cat("\nCausal forest:\n")
##
## Causal forest:
if (exists("cf_results")) {
for (s in names(cf_results)) cat(sprintf(" %s: n = %d\n", s, nrow(cf_results[[s]])))
}
## Misame: n = 833
## Vital: n = 300