title: “RCT simulation & bias decomposition” author: “Katia” date: “2025-11-27” output: html_document ———————

# Knit-friendly setup
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
set.seed(123)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Overview

This R Markdown reproduces the simulations and bias-decomposition exercises from Questions 3–5 and includes a short snippet for loading the female_lalonde.csv file (Question 7). Each section contains the data generation, estimands, naive estimator, bias decomposition, and checks. At the end of each section there is a brief summary of the results you can expand on for a report.

Question 3 — Gamma X1 with same mean, different variance

# ---------- 1. Data generation ----------
n_treat   <- 40
n_control <- 100

mean_target <- 10

# Fixed shape so skewness is identical
shape <- 100                 # skew = 2/sqrt(100) = 0.2

# Scale values chosen to force SD = 1 vs SD = 2
scale_treat   <- mean_target / shape            # = 0.1  → sd ≈ 1
scale_control <- (mean_target * 2) / shape      # = 0.2  → sd ≈ 2

# Generate Gamma X1 values
X1_treat   <- rgamma(n_treat,   shape = shape, scale = scale_treat)
X1_control <- rgamma(n_control, shape = shape, scale = scale_control)

X1 <- c(X1_treat, X1_control)
W <- c(rep(1, n_treat), rep(0, n_control))

# ---------- 2. Treatment effect (heterogeneous) ----------
tau_i <- 2 * X1   # individual treatment effect

# ---------- 3. Potential outcomes (no irreducible error) ----------
alpha <- 5
gamma <- 1
Y0 <- alpha + gamma * X1
Y1 <- Y0 + tau_i

# Observed outcome
Y <- ifelse(W == 1, Y1, Y0)

df <- data.frame(Y = Y, W = W, X1 = X1, Y0 = Y0, Y1 = Y1, tau = tau_i)

# ---------- 4. Estimands ----------
ATE  <- mean(df$tau)
ATT  <- mean(df$tau[df$W == 1])
ATC  <- mean(df$tau[df$W == 0])

# ---------- 5. Estimator (naive difference in means) ----------
naive_diff <- mean(df$Y[df$W == 1]) - mean(df$Y[df$W == 0])

# ---------- 6. Selection bias & heterogeneity term ----------
selection_bias <- mean(df$Y0[df$W == 1]) - mean(df$Y0[df$W == 0])
heterogeneity  <- ATT - ATE

# ---------- 7. Decomposition checks ----------
decomp1 <- ATT + selection_bias
decomp2 <- heterogeneity + selection_bias

# Print raw numeric outputs (no rounding)
cat("---- Estimands & estimator (Q3) ----\n")

## ---- Estimands & estimator (Q3) ----

cat("Naive difference in means: ", naive_diff, "\n")

## Naive difference in means:  9.413472

cat("ATE (mean tau): ", ATE, "\n")

## ATE (mean tau):  34.38141

cat("ATT (mean tau | treated): ", ATT, "\n")

## ATT (mean tau | treated):  19.69439

cat("ATC (mean tau | control): ", ATC, "\n\n")

## ATC (mean tau | control):  40.25621

cat("Selection bias (E[Y0|W=1]-E[Y0|W=0]): ", selection_bias, "\n")

## Selection bias (E[Y0|W=1]-E[Y0|W=0]):  -10.28091

cat("Heterogeneity (ATT - ATE): ", heterogeneity, "\n\n")

## Heterogeneity (ATT - ATE):  -14.68702

cat("Check ATT + selection_bias = ", decomp1, " ; naive = ", naive_diff, "\n")

## Check ATT + selection_bias =  9.413472  ; naive =  9.413472

cat("Check (ATT-ATE) + selection_bias = naive - ATE -> ", decomp2, " = ", naive_diff - ATE, "\n")

## Check (ATT-ATE) + selection_bias = naive - ATE ->  -24.96793  =  -24.96793

Short summary (Q3)

The naive difference equals the sum of selection bias and ATT (check holds numerically above).
Because treatment effect depends linearly on X1 and the treated and control groups have different variances (but same mean), ATT and ATE will differ when X1 distributions differ in higher moments.

Question 4 — Quartic treatment effect (tau = 2 * X1^4)

# 1. Quartic individual treatment effects
tau_i_q <- 2 * (X1^4)

# 2. Potential outcomes for quartic case
Y0_q <- alpha + gamma * X1
Y1_q <- Y0_q + tau_i_q

# 3. Observed outcome
Y_q <- ifelse(W == 1, Y1_q, Y0_q)

# 4. Estimands
ATE_q <- mean(tau_i_q)
ATT_q <- mean(tau_i_q[W == 1])
ATC_q <- mean(tau_i_q[W == 0])

# 5. Naive difference in means (SDO)
SDO_q <- mean(Y_q[W == 1]) - mean(Y_q[W == 0])

# 6. Selection bias
sel_bias_q <- mean(Y0_q[W == 1]) - mean(Y0_q[W == 0])

# 7. Bias decomposition
heterog_bias_q <- ATT_q - ATE_q
total_bias_q   <- SDO_q - ATE_q

# Print raw numeric outputs (no rounding)
cat("---- Quartic case (Q4) ----\n")

## ---- Quartic case (Q4) ----

cat("ATE_q: ", ATE_q, "\n")

## ATE_q:  252710.3

cat("ATT_q: ", ATT_q, "\n")

## ATT_q:  19677

cat("ATC_q: ", ATC_q, "\n\n")

## ATC_q:  345923.7

cat("SDO_q (naive): ", SDO_q, "\n")

## SDO_q (naive):  19666.72

cat("Selection bias_q: ", sel_bias_q, "\n")

## Selection bias_q:  -10.28091

cat("Heterogeneity bias_q (ATT_q - ATE_q): ", heterog_bias_q, "\n")

## Heterogeneity bias_q (ATT_q - ATE_q):  -233033.3

cat("Total bias_q (SDO_q - ATE_q): ", total_bias_q, "\n")

## Total bias_q (SDO_q - ATE_q):  -233043.6

cat("Check: ATT_q + sel_bias_q = ", ATT_q + sel_bias_q, " ; SDO_q = ", SDO_q, "\n")

## Check: ATT_q + sel_bias_q =  19666.72  ; SDO_q =  19666.72

Short summary (Q4)

With a quartic treatment effect, heterogeneity typically increases (large deviations for extreme X1).
If treated and control groups have different dispersion, ATT can differ substantially from ATE because the treatment effect is non-linear in X1.

Question 5 — Quartic treatment + skew differences between groups

# Re-generate X1 with different skew but same mean
# Parameters to achieve different skew
mu  <- 10     # desired mean

# Treated: near-zero skew (large shape)
shape_treat <- 5000
scale_treat <- mu / shape_treat

# Control: high skew (shape = 1)
shape_ctrl <- 1
scale_ctrl <- mu / shape_ctrl

X1_treat_q5   <- rgamma(n_treat,   shape = shape_treat, scale = scale_treat)
X1_control_q5 <- rgamma(n_control, shape = shape_ctrl,  scale = scale_ctrl)

X1_q5 <- c(X1_treat_q5, X1_control_q5)
W_q5  <- c(rep(1, n_treat), rep(0, n_control))

# Quartic effects
tau_i_q5 <- 2 * (X1_q5^4)

# Potential outcomes
Y0_q5 <- alpha + gamma * X1_q5
Y1_q5 <- Y0_q5 + tau_i_q5
Y_q5  <- ifelse(W_q5 == 1, Y1_q5, Y0_q5)

# Estimands
ATE_q5 <- mean(tau_i_q5)
ATT_q5 <- mean(tau_i_q5[W_q5 == 1])
ATC_q5 <- mean(tau_i_q5[W_q5 == 0])

# Naive estimator
SDO_q5 <- mean(Y_q5[W_q5 == 1]) - mean(Y_q5[W_q5 == 0])

# Bias decomposition
sel_bias_q5    <- mean(Y0_q5[W_q5 == 1]) - mean(Y0_q5[W_q5 == 0])
heterog_bias_q5 <- ATT_q5 - ATE_q5
total_bias_q5   <- SDO_q5 - ATE_q5

# Print raw numeric outputs (no rounding)
cat("---- Q5: Quartic + Skew Difference ----\n")

## ---- Q5: Quartic + Skew Difference ----

cat("ATE_q5: ", ATE_q5, "\n")

## ATE_q5:  439129.6

cat("ATT_q5: ", ATT_q5, "\n")

## ATT_q5:  20083.69

cat("ATC_q5: ", ATC_q5, "\n\n")

## ATC_q5:  606748

cat("SDO_q5 (naive): ", SDO_q5, "\n")

## SDO_q5 (naive):  20083.76

cat("Selection bias_q5: ", sel_bias_q5, "\n")

## Selection bias_q5:  0.06483348

cat("Heterogeneity bias_q5: ", heterog_bias_q5, "\n")

## Heterogeneity bias_q5:  -419045.9

cat("Total bias_q5: ", total_bias_q5, "\n")

## Total bias_q5:  -419045.8

cat("Check: ATT_q5 + sel_bias_q5 = ", ATT_q5 + sel_bias_q5, " ; SDO_q5 = ", SDO_q5, "\n")

## Check: ATT_q5 + sel_bias_q5 =  20083.76  ; SDO_q5 =  20083.76

Short summary (Q5)

When the treated and control groups differ in skew (even when means are matched), non-linear treatment effects (like quartic) amplify the impact of distributional differences.
The naive estimator can be biased: decomposition isolates how much comes from baseline differences (selection bias) and how much from heterogeneous effects (ATT vs ATE).

Question 7 — Loading the `female_lalonde.csv` dataset

file_path <- "/Users/katiankurunzizagwaneza/Downloads/female_lalonde.csv"
if (file.exists(file_path)) {
  lal <- read.csv(file_path)
  cat("\n First few rows of data: \n")
  head(lal)
} else {
  message("File not found at: ", file_path, " — please set `file_path` to the correct location.")
}

## 
##  First few rows of data:

##   treated age educ nodegree married black hisp moa     re75      re76     re77
## 1      99  46   12        0       0     0    0  NA 11277.16  3183.434 16695.47
## 2      99  29   12        0       0     1    0  NA 21480.29 23706.422 22578.64
## 3      99  40   11        1       0     1    0  NA     0.00     0.000     0.00
## 4      99  37   12        0       0     1    0  NA     0.00     0.000     0.00
## 5      99  31   12        0       0     1    0  NA 10795.64 15578.506 13594.89
## 6      99  42    9        1       0     1    0  NA     0.00     0.000     0.00
##       re78     re79     re74 afdc75 nchildren75 zero74 zero75     redif lalonde
## 1 16953.62 17922.08 16217.85      0           0      0      0 6644.9248      NA
## 2 20687.76 21249.48 23252.09      0           0      0      0 -230.8145      NA
## 3     0.00     0.00     0.00      1           5      1      1    0.0000      NA
## 4     0.00     0.00     0.00      1           2      1      1    0.0000      NA
## 5 14481.43 13129.24 11919.14      0           0      0      0 2333.6035      NA
## 6     0.00     0.00     0.00      1           4      1      1    0.0000      NA
##   psid1 psid2 sample_lalonde sample_dw sample_early
## 1     1    NA              0         0            0
## 2     1    NA              0         0            0
## 3     1     1              0         0            0
## 4     1     1              0         0            0
## 5     1    NA              0         0            0
## 6     1     1              0         0            0

## Separate data into experimental and observational groups
rct_0 <- lal[lal$treated == 0, ]   # RCT control group
rct_1 <- lal[lal$treated == 1, ]   # RCT treatment group
psid  <- lal[lal$treated == 99, ]  # PSID observational comparison group


# 1. Regression to estimate treatment coefficients and CI interval for RCT units
rct_data <- lal[lal$treated != 99, ] # RCT data

regression_rct <- lm(rct_data$re79 ~ rct_data$age + rct_data$educ + rct_data$nodegree + rct_data$married + rct_data$black + rct_data$hisp + rct_data$re75 + rct_data$re74 + rct_data$nchildren75 + rct_data$zero74 + rct_data$zero75 + rct_data$treated, data = rct_data)
summary(regression_rct)

## 
## Call:
## lm(formula = rct_data$re79 ~ rct_data$age + rct_data$educ + rct_data$nodegree + 
##     rct_data$married + rct_data$black + rct_data$hisp + rct_data$re75 + 
##     rct_data$re74 + rct_data$nchildren75 + rct_data$zero74 + 
##     rct_data$zero75 + rct_data$treated, data = rct_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7666  -4056  -2724   4199  24941 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          1972.33872 1807.45761   1.091  0.27540   
## rct_data$age           -0.82507   22.06427  -0.037  0.97018   
## rct_data$educ         182.33368  105.36726   1.730  0.08381 . 
## rct_data$nodegree    -762.31495  437.77162  -1.741  0.08188 . 
## rct_data$married      870.16551  898.83039   0.968  0.33319   
## rct_data$black        229.61323  709.20579   0.324  0.74618   
## rct_data$hisp         908.58533  829.37418   1.096  0.27352   
## rct_data$re75           0.16021    0.31893   0.502  0.61552   
## rct_data$re74          -0.02356    0.30040  -0.078  0.93751   
## rct_data$nchildren75   99.17803  119.36985   0.831  0.40623   
## rct_data$zero74      -907.72359 1292.61438  -0.702  0.48267   
## rct_data$zero75       741.29602 1216.25319   0.609  0.54232   
## rct_data$treated      868.89227  307.41041   2.826  0.00479 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5276 on 1173 degrees of freedom
##   (416 observations deleted due to missingness)
## Multiple R-squared:  0.02661,    Adjusted R-squared:  0.01665 
## F-statistic: 2.672 on 12 and 1173 DF,  p-value: 0.001503

confint(regression_rct, "rct_data$treated")

##                     2.5 %   97.5 %
## rct_data$treated 265.7566 1472.028

We dropped about 416 observations due to missing data on running the lm regression. That might have led to a wider confidence interval.

# 2. Create an observational dataset by deleting RCT controls and relabeling PSID units (treat=99) as 0.
obs_data <- rbind(rct_1, psid)
obs_data$treated[obs_data$treated == 99] <- 0
table(obs_data$treated)

# 3. Perform genetic matching (using the Matching library) with: pop.size = 100, wait.generations = 10, max.generations = 100

install.packages("Matching")
install.packages("rgenoud")
library(Matching)
library(rgenoud)

#Remove rows with missing data
# How many rows total
nrow(obs_data)
# columns to require non-missing
required <- c("treated","re79",
              "age", "educ", "nodegree", "married", "black", "hisp",
              "re75", "re74", "nchildren75", "zero74", "zero75")

initial_n <- nrow(obs_data)
obs_data <- obs_data[complete.cases(obs_data[, required]), ]
print(table(obs_data$treated))
cat("Dropped", initial_n - nrow(obs_data), "rows; remaining:", nrow(obs_data), "\n")

# Outcome and treatment
Y  <- obs_data$re79
Tr <- obs_data$treated

# Covariate matrix (base R specification)
X <- as.matrix(obs_data[, c(
  "age", "educ", "nodegree", "married", "black", "hisp",
  "re75", "re74", "nchildren75", "zero74", "zero75"
)])

# Genetic search for optimal weights
genout <- GenMatch( Tr = Tr, X  = X, pop.size = 100, wait.generations = 10, max.generations = 100) 


# Matching using optimal weights
mout <- Match( Y = Y, Tr = Tr, X = X, Weight.matrix = genout)
summary(mout)
# 4. Report balance results and lowest p-value.
# --- MatchBalance: bootstrap balance check (safe to missing covariates) ---
bal <- MatchBalance(
  Tr ~ age + educ + nodegree + married + black + hisp +
        re75 + re74 + nchildren75 + zero74 + zero75,
  match.out = mout,
  nboots = 500
)
print(bal)

# --- Balance checks and treatment-effect estimation ---
covars <- c("age", "educ", "nodegree", "married", "black", "hisp",
            "re75", "re74", "nchildren75", "zero74", "zero75")

# P-values before matching
before_pvals <- sapply(covars, function(v) {
  t.test(obs_data[[v]] ~ obs_data$treated)$p.value
})
names(before_pvals) <- covars

# P-values after matching (compare matched treated vs matched controls)
matched_t <- obs_data[mout$index.treated, ]
matched_c <- obs_data[mout$index.control, ]
after_pvals <- sapply(covars, function(v) {
  t.test(matched_t[[v]], matched_c[[v]])$p.value
})
names(after_pvals) <- covars

cat("Lowest p-value after matching:", format(min(after_pvals), digits = 4), "\n")
cat("P-values after matching:\n")
print(after_pvals)

# 5. Estimate treatment effect, standard error, and (bonus) confidence interval.

# ATT from matched pairs (direct paired differences)
pair_diffs <- Y[mout$index.treated] - Y[mout$index.control]
n_pairs <- length(pair_diffs)
att_est <- mean(pair_diffs)
att_se  <- sd(pair_diffs) / sqrt(n_pairs)
ci_mult <- qt(0.975, df = n_pairs - 1)
ci_lower <- att_est - ci_mult * att_se
ci_upper <- att_est + ci_mult * att_se
cat(sprintf("Estimated ATT = %.4f; SE = %.4f; 95%% CI = [%.4f, %.4f]\n",
            att_est, att_se, ci_lower, ci_upper))

# Also show Match() reported estimate/se if present
if (!is.null(mout$est)) cat(sprintf("Match() reported est = %.4f\n", mout$est))
if (!is.null(mout$se))  cat(sprintf("Match() reported se  = %.4f\n", mout$se))

# 6. Using the quantreg library, estimate and plot quantile treatment effects with CIs.

matched_treated  <- obs_data[mout$index.treated, ]
matched_control  <- obs_data[mout$index.control, ]

matched_data <- rbind(
  transform(matched_treated,  matched_pair = 1:length(mout$index.treated)),
  transform(matched_control, matched_pair = 1:length(mout$index.control))
)

###############################################
# Quantile Treatment Effects (quantreg version)
###############################################

library(quantreg)

# Construct matched dataset for rq
matched_treated  <- obs_data[mout$index.treated, ]
matched_control  <- obs_data[mout$index.control, ]

matched_treated$treated  <- 1
matched_control$treated  <- 0

matched_data <- rbind(matched_treated, matched_control)

# Quantiles to analyze
taus <- seq(0.1, 0.9, by = 0.1)

# Storage
qte_est  <- numeric(length(taus))
qte_low  <- numeric(length(taus))
qte_high <- numeric(length(taus))

for (i in seq_along(taus)) {
  tau <- taus[i]

  rq_fit <- rq(re79 ~ treated, tau = tau, data = matched_data)

  # Bootstrapped SEs for CI
  rq_sum <- summary(rq_fit, se = "boot", R = 500)

  est <- rq_sum$coefficients["treated", "Value"]
  se  <- rq_sum$coefficients["treated", "Std. Error"]

  # 95% CI
  qte_est[i]  <- est
  qte_low[i]  <- est - 1.96 * se
  qte_high[i] <- est + 1.96 * se
}

# --------------------------
# Plot QTEs with 95% CIs
# --------------------------
plot(
  taus, qte_est, type = "b", pch = 19,
  ylim = range(qte_low, qte_high),
  xlab = "Quantile (tau)", ylab = "Quantile Treatment Effect",
  main = "Quantile Treatment Effects with 95% CI (Matched Sample)"
)

# CI bands
segments(taus, qte_low, taus, qte_high, lwd = 2)
abline(h = 0, lty = 2)

Question 8

library(tidyverse)
library(haven)
library(Synth)
library(devtools)
library(SCtools)

read_data <- function(df)
{
  full_path <- paste("https://github.com/scunning1975/mixtape/raw/master/", 
                     df, sep = "")
  df <- read_dta(full_path)
  return(df)
}

texas <- read_data("texas.dta") %>%
  as.data.frame(.)

dataprep_out <- dataprep(
  foo = texas,
  predictors = c("poverty", "income"),
  predictors.op = "mean",
  time.predictors.prior = 1985:1993,
  special.predictors = list(
    list("bmprison", c(1988, 1990:1992), "mean"),
    list("alcohol", 1990, "mean"),
    list("aidscapita", 1990:1991, "mean"),
    list("black", 1990:1992, "mean"),
    list("perc1519", 1990, "mean")),
  dependent = "bmprison",
  unit.variable = "statefip",
  unit.names.variable = "state",
  time.variable = "year",
  treatment.identifier = 48,
  controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56),
  time.optimize.ssr = 1985:1993,
  time.plot = 1985:2000
)

synth_out <- synth(data.prep.obj = dataprep_out)

path.plot(synth_out, dataprep_out)
gaps.plot(synth_out, dataprep_out)
placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3)

plot_placebos(placebos)

mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE)

Appendix

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Argentina/Buenos_Aires
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.1.4
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.37     R6_2.6.1          fastmap_1.2.0     tidyselect_1.2.1 
##  [5] xfun_0.53         magrittr_2.0.3    glue_1.8.0        cachem_1.1.0     
##  [9] tibble_3.3.0      knitr_1.50        pkgconfig_2.0.3   htmltools_0.5.8.1
## [13] generics_0.1.4    rmarkdown_2.30    lifecycle_1.0.4   cli_3.6.5        
## [17] vctrs_0.6.5       sass_0.4.10       jquerylib_0.1.4   compiler_4.5.1   
## [21] rstudioapi_0.17.1 tools_4.5.1       pillar_1.11.0     evaluate_1.0.5   
## [25] bslib_0.9.0       rlang_1.1.6       jsonlite_2.0.0