Data Generation: Balanced Mean & Variance, Imbalanced Skewness

## ---- data-corrected ----
library(moments)

# Sample sizes
n_treat   <- 40
n_control <- 100

# Target parameters
mu  <- 10
var_target <- 2
sd_target  <- sqrt(var_target)

# 1) Treated: symmetric (approx zero skew) — Normal
X1_treat <- rnorm(n_treat, mean = mu, sd = sd_target)

# 2) Control: start from a skewed distribution (Gamma shape=1 has skew=2)
#    Then standardize and rescale to have mean=mu and var=var_target.
X_raw_ctrl <- rgamma(n_control, shape = 1, scale = 1)  # raw skew ~2
X1_control <- (X_raw_ctrl - mean(X_raw_ctrl)) / sd(X_raw_ctrl) * sd_target + mu

# Combine
X1 <- c(X1_treat, X1_control)
W  <- c(rep(1, n_treat), rep(0, n_control))

# Diagnostics (sample)
cat("Treated:   mean=", round(mean(X1_treat),4),
    "  var=", round(var(X1_treat),4),
    "  skew=", round(skewness(X1_treat),4), "\n")
## Treated:   mean= 10.0639   var= 1.612   skew= -0.0702
cat("Control:   mean=", round(mean(X1_control),4),
    "  var=", round(var(X1_control),4),
    "  skew=", round(skewness(X1_control),4), "\n")
## Control:   mean= 10   var= 2   skew= 1.7889
cat("Check mean diff:", round(mean(X1_treat)-mean(X1_control),6), "\n")
## Check mean diff: 0.063899
cat("Check var diff: ", round(var(X1_treat)-var(X1_control),6), "\n")
## Check var diff:  -0.387965

Conditions Verification: - Mean balance: Treatment (10.06) ≈ Control (10) - Variance balance: Treatment (1.61) ≈ Control (2) - Skewness imbalance: Treatment (-0.07) ≠ Control (1.79)

Generate Outcome with Quartic Treatment Effect

# Generate observed outcome Y
# Baseline: Y = 5 + X1
# Treatment effect: adds 2*X1^4 if treated (QUARTIC)
Y <- 5 + X1 + W * (2 * X1^4)

Estimands: ATT, ATC, ATE

# True treatment effect for each unit: tau_i = 2 * X1^4
tau <- 2 * X1^4

# Calculate estimands
ATT <- mean(tau[W == 1])  # Average treatment effect on treated
ATC <- mean(tau[W == 0])  # Average treatment effect on controls

# ATE as weighted average of ATT and ATC
p_treat <- mean(W)  # Proportion treated
ATE <- p_treat * ATT + (1 - p_treat) * ATC

cat("ATT (treatment effect on treated):  ", format(round(ATT, 2), big.mark=","), "\n")
## ATT (treatment effect on treated):   22,427.84
cat("ATC (treatment effect on controls): ", format(round(ATC, 2), big.mark=","), "\n")
## ATC (treatment effect on controls):  22,827.33
cat("ATE (weighted average):             ", format(round(ATE, 2), big.mark=","), "\n\n")
## ATE (weighted average):              22,713.19
cat("ATE = p*ATT + (1-p)*ATC\n")
## ATE = p*ATT + (1-p)*ATC
cat("    = ", round(p_treat, 3), "*", format(round(ATT, 2), big.mark=","), " + ", 
    round(1-p_treat, 3), "*", format(round(ATC, 2), big.mark=","), "\n")
##     =  0.286 * 22,427.84  +  0.714 * 22,827.33
cat("    = ", format(round(ATE, 2), big.mark=","), "\n")
##     =  22,713.19

Results: With quartic treatment effect and skewness imbalance: - ATT = 22,427.84 - ATC = 22,827.33 - ATE = 22,713.19

Bias Decomposition

# Naive difference in means
naive_diff <- mean(Y[W == 1]) - mean(Y[W == 0])

# Selection bias: baseline difference between groups
Y0_treated <- 5 + X1[W == 1]
Y0_control <- 5 + X1[W == 0]
selection_bias <- mean(Y0_treated) - mean(Y0_control)

# Heterogeneity bias: difference between ATT and ATE
heterogeneity_bias <- ATT - ATE

# Total bias
bias <- naive_diff - ATE

cat("Naive difference in means:                ", format(round(naive_diff, 2), big.mark=","), "\n")
## Naive difference in means:                 22,427.91
cat("Selection bias (E[Y0|W=1] - E[Y0|W=0]):  ", round(selection_bias, 4), "\n")
## Selection bias (E[Y0|W=1] - E[Y0|W=0]):   0.0639
cat("Heterogeneity bias (ATT - ATE):          ", format(round(heterogeneity_bias, 2), big.mark=","), "\n")
## Heterogeneity bias (ATT - ATE):           -285.35
cat("Total bias (naive_diff - ATE):           ", format(round(bias, 2), big.mark=","), "\n\n")
## Total bias (naive_diff - ATE):            -285.28
# Verify decomposition 1: naive_diff = ATT + selection_bias
decomp1 <- ATT + selection_bias
cat("Decomposition 1: ATT + selection_bias\n")
## Decomposition 1: ATT + selection_bias
cat("                 ", format(round(ATT, 2), big.mark=","), " + ", round(selection_bias, 4), 
    " = ", format(round(decomp1, 2), big.mark=","), "\n")
##                   22,427.84  +  0.0639  =  22,427.91
cat("                 Should equal naive_diff = ", format(round(naive_diff, 2), big.mark=","), "\n")
##                  Should equal naive_diff =  22,427.91
cat("                 Match? ", isTRUE(all.equal(decomp1, naive_diff)), "\n\n")
##                  Match?  TRUE
# Verify decomposition 2: naive_diff = ATE + selection_bias + heterogeneity_bias
decomp2 <- ATE + selection_bias + heterogeneity_bias
cat("Decomposition 2: ATE + selection_bias + heterogeneity_bias\n")
## Decomposition 2: ATE + selection_bias + heterogeneity_bias
cat("                 ", format(round(ATE, 2), big.mark=","), " + ", round(selection_bias, 4), 
    " + ", format(round(heterogeneity_bias, 2), big.mark=","), " = ", format(round(decomp2, 2), big.mark=","), "\n")
##                   22,713.19  +  0.0639  +  -285.35  =  22,427.91
cat("                 Should equal naive_diff = ", format(round(naive_diff, 2), big.mark=","), "\n")
##                  Should equal naive_diff =  22,427.91
cat("                 Match? ", isTRUE(all.equal(decomp2, naive_diff)), "\n")
##                  Match?  TRUE

Summary: The naive difference of 22,427.91 can be decomposed as:

  1. ATT + Selection Bias = 22,427.84 + 0.0639 = 22,427.91

  2. ATE + Selection Bias + Heterogeneity Bias = 22,713.19 + 0.0639 + -285.35 = 22,427.91