library(rstanarm)
## Loading required package: Rcpp
## This is rstanarm version 2.32.2
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores())
library(loo)
## This is loo version 2.8.0
## - Online documentation and vignettes at mc-stan.org/loo
## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.
## - Windows 10 users: loo may be very slow if 'mc.cores' is set in your .Rprofile file (see https://github.com/stan-dev/loo/issues/94).
library(palmerpenguins) # provides `penguins`
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
# Use complete cases for needed columns
dat <- na.omit(penguins[, c("body_mass_g", "bill_length_mm", "flipper_length_mm")])
set.seed(123)
# Models
b1 <- stan_glm(body_mass_g ~ 1, data = dat, refresh = 0)
b2 <- stan_glm(body_mass_g ~ bill_length_mm, data = dat, refresh = 0)
b3 <- stan_glm(body_mass_g ~ bill_length_mm + flipper_length_mm, data = dat, refresh = 0)
# Criteria
waic_b1 <- waic(b1); waic_b2 <- waic(b2); waic_b3 <- waic(b3)
loo_b1 <- loo(b1); loo_b2 <- loo(b2); loo_b3 <- loo(b3)
# Compare (best first)
loo_compare(list(b1 = waic_b1, b2 = waic_b2, b3 = waic_b3)) # WAIC
## elpd_diff se_diff
## b3 0.0 0.0
## b2 -168.2 14.1
## b1 -241.8 14.7
loo_compare(list(b1 = loo_b1, b2 = loo_b2, b3 = loo_b3)) # LOO (preferred)
## elpd_diff se_diff
## b3 0.0 0.0
## b2 -168.1 14.1
## b1 -241.8 14.7
# ------------------ FULL, SELF-CONTAINED CODE ------------------
# Action box helper (HTML)
action_box <- function(title, items, border = "#000000", bg = "#ff69b4") { # purple border, light green bg
html <- paste0(
'<div style="border-left:6px solid ', border,
';padding:12px 14px;background:', bg,
';border-radius:10px;margin:14px 0 6px 0">',
'<h4 style="margin:0 0 8px 0;color:', border, ';font-weight:bold;">',
title, '</h4>',
'<ul style="margin:0;list-style-type:disc;color:black;">',
paste0("<li>", items, "</li>", collapse = ""),
"</ul></div>"
)
cat(html)
}
action_box(
title = "Action Items 1: Interpreting the WAIC/LOO Tables",
items = c(
# Bullet 1 — bold prompt + inline answer (same style as Action Items 2)
"<b>Which model has the lowest WAIC/LOO?</b> WAIC ≈ 5063.3 and LOOIC ≈ 5063.2.",
# Bullet 2 — bold prompt + multi-line details separated by <br/>
paste0(
"<b>Look at the ΔWAIC/ΔLOO values:</b><br/>",
"Which models are essentially equivalent? Both models are practically the same.<br/>",
"Which have very little support? b2 has very little support; b1 has none.<br/>",
"Compare the results with the AIC table from earlier. Do they agree? Yes—b3 > b2 > b1.<br/>",
"<b>One-sentence summary:</b> Based on WAIC/LOO, the two-predictor model is best, ",
"the single-predictor model has little support, and the intercept-only model has virtually none."
)
),
border = "#000000", # same green as Action Items 2
bg = "#ff69b4" # same light-orange background
)
# --- SIMULATE DATA FROM THE DAG ---
set.seed(42)
n <- 400
# Confounder: neighborhood income (standardized)
Income <- rnorm(n, 0, 1) # z scores, from low to high income
# Sunlight exposure (affected by income; wealthier areas might have different canopy/lot structure)
Sun <- 0.6*Income + rnorm(n, 0, 1)
# Species choice depends on income (wealthier neighborhoods plant different species)
# Species: A (baseline) vs B
pB <- plogis(-0.25 + 0.9*Income) # richer -> more likely Species B
Species_B <- rbinom(n, 1, pB)
Species <- factor(ifelse(Species_B==1, "B", "A"))
# Photosynthetic method determined by species (B more likely C4)
# Method: 0 = C3, 1 = C4
pC4 <- ifelse(Species=="B", 0.75, 0.15)
Method <- rbinom(n, 1, pC4)
# TRUE data-generating process for photosynthetic efficiency (the outcome)
# Efficiency is influenced by Species (direct), Method (mediator), and Sun (co-parent via Income)
# (Note: Income does NOT directly cause Efficiency; its influence is via Species/Sun)
beta_species <- 0.35 # total species effect partly direct
beta_method <- 0.65 # mediator effect (C4 tends to be more efficient)
beta_sun <- 0.55 # more sun -> higher efficiency
eps_y <- rnorm(n, 0, 0.9)
Efficiency <- beta_species*(Species=="B") + beta_method*Method + beta_sun*Sun + eps_y
# Collider/descendant measured post-photosynthesis: Starch (don’t control for it in causal model!)
# Starch is influenced by Efficiency (upstream) and Species (storage physiology)
Starch <- 0.8*Efficiency + 0.4*(Species=="B") + rnorm(n, 0, 0.8)
mydata <- data.frame(Income, Sun, Species, Method=factor(Method, levels=c(0,1), labels=c("C3","C4")),
Efficiency, Starch)
str(mydata)
## 'data.frame': 400 obs. of 6 variables:
## $ Income : num 1.371 -0.565 0.363 0.633 0.404 ...
## $ Sun : num 2.157 -1.208 0.273 0.429 -0.336 ...
## $ Species : Factor w/ 2 levels "A","B": 1 1 1 2 1 2 2 2 2 1 ...
## $ Method : Factor w/ 2 levels "C3","C4": 2 1 1 1 1 2 2 1 2 1 ...
## $ Efficiency: num 1.165 -0.632 0.441 0.928 0.604 ...
## $ Starch : num 0.955 1.042 1.023 0.994 0.91 ...
# --- DAG for reference and adjustment logic ---
library(dagitty)
library(ggdag)
##
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
##
## filter
dag <- dagitty('dag {
Income -> Species
Income -> Sun
Species -> Method
Method -> Efficiency
Sun -> Efficiency
Species -> Efficiency
Efficiency -> Starch
Species -> Starch
}')
ggdag(dag, layout = "nicely") +
geom_dag_point(size = 20) + # make nodes larger
geom_dag_edges(edge_width = 0.8) + # thicker edges
geom_dag_text(size = 3) + # bigger labels
theme_dag()
# ------------------ FULL, SELF-CONTAINED CODE ------------------
# Action box helper (HTML)
action_box <- function(title, items, border = "#1976d2", bg = "#fde2f3") { # purple border, light green bg
html <- paste0(
'<div style="border-left:6px solid ', border,
';padding:12px 14px;background:', bg,
';border-radius:10px;margin:14px 0 6px 0">',
'<h4 style="margin:0 0 8px 0;color:', border, ';font-weight:bold;">',
title, '</h4>',
'<ul style="margin:0;list-style-type:disc;color:black;">',
paste0("<li>", items, "</li>", collapse = ""),
"</ul></div>"
)
cat(html)
}
action_box(
title = "Action Items 2: Interpreting the DAG",
items = c(
"<b>Fork (Confounding):</b>
Do you see any variable that points into both Species and Sun (a common cause)? Income
How would this confound the effect of Species on Efficiency? It opens the backdoor path",
"<b>Pipe (Mediator):</b>
Trace the path Species -> Method -> Efficiency.
Is Method a confounder, mediator, or collider?
I believe that this method is the mediator.",
"Should you control for it if your goal is the total effect of Species on Efficiency?
You should not control if you want total effect",
"<b>Collider (Selection Bias):</b>
Look at Efficiency -> Starch <- Species.
Why is Starch a collider here? Two arrows converge into starch which is the collider
What happens if you condition on Starch when estimating the effect of Species on Efficiency? It opens a fake association between species and efficency
",
"<b>Adjustment Set:</b>
According to the DAG, which variables should you adjust for when estimating the causal effect of Species on Efficiency? Income
Which variables must you not adjust for? Method and stretch"
),
border = "#1565c0", bg = "#f8bbd0"
)
Fork (Confounding):
Do you see any variable that points into both Species and Sun (a common cause)? Income
How would this confound the effect of Species on Efficiency? It opens the backdoor pathPipe (Mediator):
Trace the path Species -> Method -> Efficiency. Is Method a confounder, mediator, or collider?
I believe that this method is the mediator.Collider (Selection Bias): Look at Efficiency -> Starch <- Species.
Why is Starch a collider here? Two arrows converge into starch which is the collider
What happens if you condition on Starch when estimating the effect of Species on Efficiency? It opens a fake association between species and efficencyAdjustment Set:
According to the DAG, which variables should you adjust for when estimating the causal effect of Species on Efficiency? Income
Which variables must you not adjust for? Method and stretch# --- FIT MODELS ---
library(rstanarm)
library(loo)
# Predictive model: include anything that helps prediction (even mediator/collider descendants)
# (Intentionally includes Method and Starch)
pred_model <- stan_glm(Efficiency ~ Species + Method + Sun + Income + Starch,
data = mydata, refresh = 0)
# Causal model: effect of Species on Efficiency
# Adjust for the confounder Income (and optionally include Sun as a precision variable).
# Do NOT adjust for Method (mediator) or Starch (collider/descendant).
causal_model <- stan_glm(Efficiency ~ Species + Income + Sun,
data = mydata, refresh = 0)
# How to write summary output in a Bayesian model
library(broom.mixed) # need to add
library(broom) # need to add
# The below will give you the model outputs, but we will come back to this.
est_pred <- tidy(pred_model, conf.int = TRUE, conf.level = 0.90)
est_caus <- tidy(causal_model, conf.int = TRUE, conf.level = 0.90)
rbind(
subset(est_pred, term=="SpeciesB")[,c("term","estimate","conf.low","conf.high")],
subset(est_caus, term=="SpeciesB")[,c("term","estimate","conf.low","conf.high")]
)
## # A tibble: 2 × 4
## term estimate conf.low conf.high
## <chr> <dbl> <dbl> <dbl>
## 1 SpeciesB 0.0711 -0.0726 0.213
## 2 SpeciesB 0.902 0.741 1.06
# --- MODEL COMPARISON: WAIC & LOO ---
waic_pred <- waic(pred_model)
waic_caus <- waic(causal_model)
loo_pred <- loo(pred_model)
loo_caus <- loo(causal_model)
# Preferred: LOO
loo_compare(list(causal = loo_caus, pred = loo_pred))
## elpd_diff se_diff
## pred 0.0 0.0
## causal -148.3 14.3
# Also show WAIC (should agree)
loo_compare(list(causal = waic_caus, pred = waic_pred))
## elpd_diff se_diff
## pred 0.0 0.0
## causal -148.3 14.3
# --- SPECIES EFFECT: PREDICTIVE VS CAUSAL ---
posterior_pred <- as.data.frame(posterior_interval(pred_model, prob = 0.9))
posterior_caus <- as.data.frame(posterior_interval(causal_model, prob = 0.9))
# Extract SpeciesB coefficient intervals from both models
coef_names <- rownames(posterior_pred)
idx_pred <- which(grepl("^SpeciesB$", coef_names))
idx_caus <- which(rownames(posterior_caus) == "SpeciesB")
data.frame(
model = c("Predictive", "Causal"),
low_90 = c(posterior_pred[idx_pred,1], posterior_caus[idx_caus,1]),
high_90 = c(posterior_pred[idx_pred,2], posterior_caus[idx_caus,2])
)
## model low_90 high_90
## 1 Predictive -0.0725803 0.2125913
## 2 Causal 0.7413986 1.0596023
library(broom.mixed)
# OPTIONAL: show posterior means side-by-side for readability
library(broom)
est_pred <- tidy(pred_model, conf.int = TRUE, conf.level = 0.90)
est_caus <- tidy(causal_model, conf.int = TRUE, conf.level = 0.90)
subset(est_pred, term=="SpeciesB")[,c("term","estimate","conf.low","conf.high")]
## # A tibble: 1 × 4
## term estimate conf.low conf.high
## <chr> <dbl> <dbl> <dbl>
## 1 SpeciesB 0.0711 -0.0726 0.213
subset(est_caus, term=="SpeciesB")[,c("term","estimate","conf.low","conf.high")]
## # A tibble: 1 × 4
## term estimate conf.low conf.high
## <chr> <dbl> <dbl> <dbl>
## 1 SpeciesB 0.902 0.741 1.06
# ------------------ FULL, SELF-CONTAINED CODE ------------------
# Action box helper (HTML)
action_box <- function(title, items, border = "#2e7d32", bg = "#ffcc80") { # purple border, light green bg
html <- paste0(
'<div style="border-left:6px solid ', border,
';padding:12px 14px;background:', bg,
';border-radius:10px;margin:14px 0 6px 0">',
'<h4 style="margin:0 0 8px 0;color:', border, ';font-weight:bold;">',
title, '</h4>',
'<ul style="margin:0;list-style-type:disc;color:black;">',
paste0("<li>", items, "</li>", collapse = ""),
"</ul></div>"
)
cat(html)
}
# Call the function
action_box(
title = "Action Items 3: Interpret_This_Example",
items = c(
# Items (no trailing comma, each item fully inside quotes)
items <- c(
"<b>Which model predicts better according to WAIC/LOO? Why is that expected?</b>
Predictive model because it tops waic/loo. This is expected because it is tuned to make the best forcast
",
"<b>How does the SpeciesB effect differ between predictive and causal models?</b>
Predictive is near zero and causal is positive",
"<b>Which model would you trust if your goal is prediction?</b>
Prediction because it uses the predictive model",
"<b>Which model would you trust if your goal is causal inference?</b>
Casual inference because it uses the causal model",
"<b>In one sentence, explain why prediction ≠ causation in this example.</b>
Predictive accuracy does not mean a true causal effect using a collider variable can make predictions sharper while baising the species B effect"
)
),
border = "#2e7d32", bg = "#ffcc80"
)
Which model predicts better according to WAIC/LOO? Why is that expected?
Predictive model because it tops waic/loo. This is expected because it is tuned to make the best forcastHow does the SpeciesB effect differ between predictive and causal models?
Predictive is near zero and causal is positiveWhich model would you trust if your goal is prediction?
Prediction because it uses the predictive modelWhich model would you trust if your goal is causal inference?
Casual inference because it uses the causal modelIn one sentence, explain why prediction ≠ causation in this example.
Predictive accuracy does not mean a true causal effect using a collider variable can make predictions sharper while baising the species B effect