# Group sizes: 40 treated units and 100 control units
n_treat <- 40
n_control <- 100
# Covariate X1 is centred at 10 in both arms but is twice as dispersed among
# controls (SD 2 vs. SD 1). Treated values are drawn first, then controls.
X1_treat <- rnorm(n = n_treat, mean = 10, sd = 1)
X1_control <- rnorm(n = n_control, mean = 10, sd = 2)
X1 <- c(X1_treat, X1_control)
# Treatment indicator aligned with X1: 1 for the treated rows, 0 for controls
W <- rep(c(1, 0), times = c(n_treat, n_control))
# Observed outcome: baseline 5 + X1, plus an extra 2 * X1 for treated units
Y <- (5 + X1) + (2 * X1) * W
# Naive estimate: raw difference in mean outcomes, treated minus control
naive_diff <- mean(Y[as.logical(W)]) - mean(Y[!W])
The difference in mean outcomes (treatment minus control) is 20.15.
# Unit-level treatment effect implied by the outcome model: tau_i = 2 * X1_i
tau <- 2 * X1
# Average of tau within each arm
ATT <- mean(tau[as.logical(W)]) # treated units (W == 1)
ATC <- mean(tau[!W]) # control units (W == 0)
# The ATE mixes ATT and ATC, weighted by the share of treated / control units
p_treat <- mean(W)
ATE <- ATT * p_treat + ATC * (1 - p_treat)
# How far the naive difference in means lands from the ATE
bias <- naive_diff - ATE
# Report the three estimands
cat("ATT (treatment effect on treated): ", round(ATT, 4), "\n")
## ATT (treatment effect on treated): 20.0904
cat("ATC (treatment effect on controls): ", round(ATC, 4), "\n")
## ATC (treatment effect on controls): 19.9711
cat("ATE (weighted average): ", round(ATE, 4), "\n\n")
## ATE (weighted average): 20.0052
# Spell out the weighted-average arithmetic step by step
cat("ATE = p*ATT + (1-p)*ATC\n")
## ATE = p*ATT + (1-p)*ATC
cat(
  " = ", round(p_treat, 3), "*", round(ATT, 4), " + ",
  round(1 - p_treat, 3), "*", round(ATC, 4), "\n"
)
## = 0.286 * 20.0904 + 0.714 * 19.9711
cat(" = ", round(ATE, 4), "\n\n")
## = 20.0052
No, this is not an unbiased estimate.
If not ATE, what is it? The naive difference estimates ATT + Selection Bias.
# Selection bias: gap between the arms in the mean untreated outcome,
# where the untreated outcome is Y0 = 5 + X1
Y0_treated <- (5 + X1)[W == 1]
Y0_control <- (5 + X1)[W == 0]
selection_bias <- mean(Y0_treated) - mean(Y0_control)
# Heterogeneity bias: how far the effect on the treated sits from the ATE
heterogeneity_bias <- ATT - ATE
# Report each bias component next to the total
cat("Selection bias (E[Y0|W=1] - E[Y0|W=0]):", round(selection_bias, 4), "\n")
## Selection bias (E[Y0|W=1] - E[Y0|W=0]): 0.0596
cat("Heterogeneity bias (ATT - ATE): ", round(heterogeneity_bias, 4), "\n")
## Heterogeneity bias (ATT - ATE): 0.0852
cat("Total bias (naive_diff - ATE): ", round(bias, 4), "\n\n")
## Total bias (naive_diff - ATE): 0.1448
# Check 1: the naive difference should equal ATT plus selection bias
decomp1 <- selection_bias + ATT
cat("Decomposition 1: ATT + selection_bias\n")
## Decomposition 1: ATT + selection_bias
cat(
  " ", round(ATT, 4), " + ", round(selection_bias, 4),
  " = ", round(decomp1, 4), "\n"
)
## 20.0904 + 0.0596 = 20.15
cat(" Should equal naive_diff = ", round(naive_diff, 4), "\n")
## Should equal naive_diff = 20.15
# all.equal() compares with a numeric tolerance rather than exact equality
cat(" Match? ", isTRUE(all.equal(decomp1, naive_diff)), "\n\n")
## Match? TRUE
# Check 2: the naive difference should equal ATE plus both bias terms
decomp2 <- (ATE + selection_bias) + heterogeneity_bias
cat("Decomposition 2: ATE + selection_bias + heterogeneity_bias\n")
## Decomposition 2: ATE + selection_bias + heterogeneity_bias
cat(
  " ", round(ATE, 4), " + ", round(selection_bias, 4),
  " + ", round(heterogeneity_bias, 4), " = ", round(decomp2, 4), "\n"
)
## 20.0052 + 0.0596 + 0.0852 = 20.15
cat(" Should equal naive_diff = ", round(naive_diff, 4), "\n")
## Should equal naive_diff = 20.15
cat(" Match? ", isTRUE(all.equal(decomp2, naive_diff)), "\n")
## Match? TRUE
Summary: The naive difference of 20.15 can be decomposed as:
ATT + Selection Bias = 20.0904 + 0.0596 = 20.15
ATE + Selection Bias + Heterogeneity Bias = 20.0052 + 0.0596 + 0.0852 = 20.15
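The selection bias arises because the treated and control groups have different realized X1 distributions: the population means are equal (only the variances differ), so the nonzero baseline gap of 0.0596 reflects the difference in the sample means of X1 between the two groups. The heterogeneity bias arises because the treatment effect varies with X1, so the same difference in mean X1 makes ATT ≠ ATE.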