## ---- data-corrected ----
library(moments)
# Sample sizes
n_treat <- 40
n_control <- 100
# Target parameters
mu <- 10
var_target <- 2
sd_target <- sqrt(var_target)
# 1) Treated: symmetric (approx zero skew) — Normal
X1_treat <- rnorm(n_treat, mean = mu, sd = sd_target)
# 2) Control: start from a skewed distribution (Gamma shape=1 has skew=2)
# Then standardize and rescale to have mean=mu and var=var_target.
X_raw_ctrl <- rgamma(n_control, shape = 1, scale = 1) # raw skew ~2
X1_control <- (X_raw_ctrl - mean(X_raw_ctrl)) / sd(X_raw_ctrl) * sd_target + mu
# Combine
X1 <- c(X1_treat, X1_control)
W <- c(rep(1, n_treat), rep(0, n_control))
# Diagnostics (sample)
cat("Treated: mean=", round(mean(X1_treat),4),
" var=", round(var(X1_treat),4),
" skew=", round(skewness(X1_treat),4), "\n")
## Treated: mean= 10.0639 var= 1.612 skew= -0.0702
cat("Control: mean=", round(mean(X1_control),4),
" var=", round(var(X1_control),4),
" skew=", round(skewness(X1_control),4), "\n")
## Control: mean= 10 var= 2 skew= 1.7889
cat("Check mean diff:", round(mean(X1_treat)-mean(X1_control),6), "\n")
## Check mean diff: 0.063899
cat("Check var diff: ", round(var(X1_treat)-var(X1_control),6), "\n")
## Check var diff: -0.387965
Conditions Verification: - Mean balance: Treatment (10.06) ≈ Control (10) - Variance balance: Treatment (1.61) ≈ Control (2) - Skewness imbalance: Treatment (-0.07) ≠ Control (1.79)
# Generate observed outcome Y
# Baseline: Y = 5 + X1
# Treatment effect: adds 2*X1^4 if treated (QUARTIC)
Y <- 5 + X1 + W * (2 * X1^4)
# True treatment effect for each unit: tau_i = 2 * X1^4
tau <- 2 * X1^4
# Calculate estimands
ATT <- mean(tau[W == 1]) # Average treatment effect on treated
ATC <- mean(tau[W == 0]) # Average treatment effect on controls
# ATE as weighted average of ATT and ATC
p_treat <- mean(W) # Proportion treated
ATE <- p_treat * ATT + (1 - p_treat) * ATC
cat("ATT (treatment effect on treated): ", format(round(ATT, 2), big.mark=","), "\n")
## ATT (treatment effect on treated): 22,427.84
cat("ATC (treatment effect on controls): ", format(round(ATC, 2), big.mark=","), "\n")
## ATC (treatment effect on controls): 22,827.33
cat("ATE (weighted average): ", format(round(ATE, 2), big.mark=","), "\n\n")
## ATE (weighted average): 22,713.19
cat("ATE = p*ATT + (1-p)*ATC\n")
## ATE = p*ATT + (1-p)*ATC
cat(" = ", round(p_treat, 3), "*", format(round(ATT, 2), big.mark=","), " + ",
round(1-p_treat, 3), "*", format(round(ATC, 2), big.mark=","), "\n")
## = 0.286 * 22,427.84 + 0.714 * 22,827.33
cat(" = ", format(round(ATE, 2), big.mark=","), "\n")
## = 22,713.19
Results: With quartic treatment effect and skewness imbalance: - ATT = 22,427.84 - ATC = 22,827.33 - ATE = 22,713.19
# Naive difference in means
naive_diff <- mean(Y[W == 1]) - mean(Y[W == 0])
# Selection bias: baseline difference between groups
Y0_treated <- 5 + X1[W == 1]
Y0_control <- 5 + X1[W == 0]
selection_bias <- mean(Y0_treated) - mean(Y0_control)
# Heterogeneity bias: difference between ATT and ATE
heterogeneity_bias <- ATT - ATE
# Total bias
bias <- naive_diff - ATE
cat("Naive difference in means: ", format(round(naive_diff, 2), big.mark=","), "\n")
## Naive difference in means: 22,427.91
cat("Selection bias (E[Y0|W=1] - E[Y0|W=0]): ", round(selection_bias, 4), "\n")
## Selection bias (E[Y0|W=1] - E[Y0|W=0]): 0.0639
cat("Heterogeneity bias (ATT - ATE): ", format(round(heterogeneity_bias, 2), big.mark=","), "\n")
## Heterogeneity bias (ATT - ATE): -285.35
cat("Total bias (naive_diff - ATE): ", format(round(bias, 2), big.mark=","), "\n\n")
## Total bias (naive_diff - ATE): -285.28
# Verify decomposition 1: naive_diff = ATT + selection_bias
decomp1 <- ATT + selection_bias
cat("Decomposition 1: ATT + selection_bias\n")
## Decomposition 1: ATT + selection_bias
cat(" ", format(round(ATT, 2), big.mark=","), " + ", round(selection_bias, 4),
" = ", format(round(decomp1, 2), big.mark=","), "\n")
## 22,427.84 + 0.0639 = 22,427.91
cat(" Should equal naive_diff = ", format(round(naive_diff, 2), big.mark=","), "\n")
## Should equal naive_diff = 22,427.91
cat(" Match? ", isTRUE(all.equal(decomp1, naive_diff)), "\n\n")
## Match? TRUE
# Verify decomposition 2: naive_diff = ATE + selection_bias + heterogeneity_bias
decomp2 <- ATE + selection_bias + heterogeneity_bias
cat("Decomposition 2: ATE + selection_bias + heterogeneity_bias\n")
## Decomposition 2: ATE + selection_bias + heterogeneity_bias
cat(" ", format(round(ATE, 2), big.mark=","), " + ", round(selection_bias, 4),
" + ", format(round(heterogeneity_bias, 2), big.mark=","), " = ", format(round(decomp2, 2), big.mark=","), "\n")
## 22,713.19 + 0.0639 + -285.35 = 22,427.91
cat(" Should equal naive_diff = ", format(round(naive_diff, 2), big.mark=","), "\n")
## Should equal naive_diff = 22,427.91
cat(" Match? ", isTRUE(all.equal(decomp2, naive_diff)), "\n")
## Match? TRUE
Summary: The naive difference of 22,427.91 can be decomposed as:
ATT + Selection Bias = 22,427.84 + 0.0639 = 22,427.91
ATE + Selection Bias + Heterogeneity Bias = 22,713.19 + 0.0639 + -285.35 = 22,427.91