title: “RCT simulation & bias decomposition” author: “Katia” date: “2025-11-27” output: html_document ———————

# Knit-friendly setup
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
set.seed(123)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Overview

This R Markdown reproduces the simulations and bias-decomposition exercises from Questions 3–5 and includes a short snippet for loading the female_lalonde.csv file (Question 7). Each section contains the data generation, estimands, naive estimator, bias decomposition, and checks. At the end of each section there is a brief summary of the results you can expand on for a report.

Question 3 — Gamma X1 with same mean, different variance

# ---------- 1. Data generation ----------
n_treat   <- 40
n_control <- 100

mean_target <- 10

# Fixed shape so skewness is identical
shape <- 100                 # skew = 2/sqrt(100) = 0.2

# Scale values chosen to force SD = 1 vs SD = 2
scale_treat   <- mean_target / shape            # = 0.1  → sd ≈ 1
scale_control <- (mean_target * 2) / shape      # = 0.2  → sd ≈ 2

# Generate Gamma X1 values
X1_treat   <- rgamma(n_treat,   shape = shape, scale = scale_treat)
X1_control <- rgamma(n_control, shape = shape, scale = scale_control)

X1 <- c(X1_treat, X1_control)
W <- c(rep(1, n_treat), rep(0, n_control))

# ---------- 2. Treatment effect (heterogeneous) ----------
tau_i <- 2 * X1   # individual treatment effect

# ---------- 3. Potential outcomes (no irreducible error) ----------
alpha <- 5
gamma <- 1
Y0 <- alpha + gamma * X1
Y1 <- Y0 + tau_i

# Observed outcome
Y <- ifelse(W == 1, Y1, Y0)

df <- data.frame(Y = Y, W = W, X1 = X1, Y0 = Y0, Y1 = Y1, tau = tau_i)

# ---------- 4. Estimands ----------
ATE  <- mean(df$tau)
ATT  <- mean(df$tau[df$W == 1])
ATC  <- mean(df$tau[df$W == 0])

# ---------- 5. Estimator (naive difference in means) ----------
naive_diff <- mean(df$Y[df$W == 1]) - mean(df$Y[df$W == 0])

# ---------- 6. Selection bias & heterogeneity term ----------
selection_bias <- mean(df$Y0[df$W == 1]) - mean(df$Y0[df$W == 0])
heterogeneity  <- ATT - ATE

# ---------- 7. Decomposition checks ----------
decomp1 <- ATT + selection_bias
decomp2 <- heterogeneity + selection_bias

# Print raw numeric outputs (no rounding)
cat("---- Estimands & estimator (Q3) ----\n")
## ---- Estimands & estimator (Q3) ----
cat("Naive difference in means: ", naive_diff, "\n")
## Naive difference in means:  9.413472
cat("ATE (mean tau): ", ATE, "\n")
## ATE (mean tau):  34.38141
cat("ATT (mean tau | treated): ", ATT, "\n")
## ATT (mean tau | treated):  19.69439
cat("ATC (mean tau | control): ", ATC, "\n\n")
## ATC (mean tau | control):  40.25621
cat("Selection bias (E[Y0|W=1]-E[Y0|W=0]): ", selection_bias, "\n")
## Selection bias (E[Y0|W=1]-E[Y0|W=0]):  -10.28091
cat("Heterogeneity (ATT - ATE): ", heterogeneity, "\n\n")
## Heterogeneity (ATT - ATE):  -14.68702
cat("Check ATT + selection_bias = ", decomp1, " ; naive = ", naive_diff, "\n")
## Check ATT + selection_bias =  9.413472  ; naive =  9.413472
cat("Check (ATT-ATE) + selection_bias = naive - ATE -> ", decomp2, " = ", naive_diff - ATE, "\n")
## Check (ATT-ATE) + selection_bias = naive - ATE ->  -24.96793  =  -24.96793

Short summary (Q3)

Question 4 — Quartic treatment effect (tau = 2 * X1^4)

# 1. Quartic individual treatment effects
tau_i_q <- 2 * (X1^4)

# 2. Potential outcomes for quartic case
Y0_q <- alpha + gamma * X1
Y1_q <- Y0_q + tau_i_q

# 3. Observed outcome
Y_q <- ifelse(W == 1, Y1_q, Y0_q)

# 4. Estimands
ATE_q <- mean(tau_i_q)
ATT_q <- mean(tau_i_q[W == 1])
ATC_q <- mean(tau_i_q[W == 0])

# 5. Naive difference in means (SDO)
SDO_q <- mean(Y_q[W == 1]) - mean(Y_q[W == 0])

# 6. Selection bias
sel_bias_q <- mean(Y0_q[W == 1]) - mean(Y0_q[W == 0])

# 7. Bias decomposition
heterog_bias_q <- ATT_q - ATE_q
total_bias_q   <- SDO_q - ATE_q

# Print raw numeric outputs (no rounding)
cat("---- Quartic case (Q4) ----\n")
## ---- Quartic case (Q4) ----
cat("ATE_q: ", ATE_q, "\n")
## ATE_q:  252710.3
cat("ATT_q: ", ATT_q, "\n")
## ATT_q:  19677
cat("ATC_q: ", ATC_q, "\n\n")
## ATC_q:  345923.7
cat("SDO_q (naive): ", SDO_q, "\n")
## SDO_q (naive):  19666.72
cat("Selection bias_q: ", sel_bias_q, "\n")
## Selection bias_q:  -10.28091
cat("Heterogeneity bias_q (ATT_q - ATE_q): ", heterog_bias_q, "\n")
## Heterogeneity bias_q (ATT_q - ATE_q):  -233033.3
cat("Total bias_q (SDO_q - ATE_q): ", total_bias_q, "\n")
## Total bias_q (SDO_q - ATE_q):  -233043.6
cat("Check: ATT_q + sel_bias_q = ", ATT_q + sel_bias_q, " ; SDO_q = ", SDO_q, "\n")
## Check: ATT_q + sel_bias_q =  19666.72  ; SDO_q =  19666.72

Short summary (Q4)

Question 5 — Quartic treatment + skew differences between groups

# Re-generate X1 with different skew but same mean
# Parameters to achieve different skew
mu  <- 10     # desired mean

# Treated: near-zero skew (large shape)
shape_treat <- 5000
scale_treat <- mu / shape_treat

# Control: high skew (shape = 1)
shape_ctrl <- 1
scale_ctrl <- mu / shape_ctrl

X1_treat_q5   <- rgamma(n_treat,   shape = shape_treat, scale = scale_treat)
X1_control_q5 <- rgamma(n_control, shape = shape_ctrl,  scale = scale_ctrl)

X1_q5 <- c(X1_treat_q5, X1_control_q5)
W_q5  <- c(rep(1, n_treat), rep(0, n_control))

# Quartic effects
tau_i_q5 <- 2 * (X1_q5^4)

# Potential outcomes
Y0_q5 <- alpha + gamma * X1_q5
Y1_q5 <- Y0_q5 + tau_i_q5
Y_q5  <- ifelse(W_q5 == 1, Y1_q5, Y0_q5)

# Estimands
ATE_q5 <- mean(tau_i_q5)
ATT_q5 <- mean(tau_i_q5[W_q5 == 1])
ATC_q5 <- mean(tau_i_q5[W_q5 == 0])

# Naive estimator
SDO_q5 <- mean(Y_q5[W_q5 == 1]) - mean(Y_q5[W_q5 == 0])

# Bias decomposition
sel_bias_q5    <- mean(Y0_q5[W_q5 == 1]) - mean(Y0_q5[W_q5 == 0])
heterog_bias_q5 <- ATT_q5 - ATE_q5
total_bias_q5   <- SDO_q5 - ATE_q5

# Print raw numeric outputs (no rounding)
cat("---- Q5: Quartic + Skew Difference ----\n")
## ---- Q5: Quartic + Skew Difference ----
cat("ATE_q5: ", ATE_q5, "\n")
## ATE_q5:  439129.6
cat("ATT_q5: ", ATT_q5, "\n")
## ATT_q5:  20083.69
cat("ATC_q5: ", ATC_q5, "\n\n")
## ATC_q5:  606748
cat("SDO_q5 (naive): ", SDO_q5, "\n")
## SDO_q5 (naive):  20083.76
cat("Selection bias_q5: ", sel_bias_q5, "\n")
## Selection bias_q5:  0.06483348
cat("Heterogeneity bias_q5: ", heterog_bias_q5, "\n")
## Heterogeneity bias_q5:  -419045.9
cat("Total bias_q5: ", total_bias_q5, "\n")
## Total bias_q5:  -419045.8
cat("Check: ATT_q5 + sel_bias_q5 = ", ATT_q5 + sel_bias_q5, " ; SDO_q5 = ", SDO_q5, "\n")
## Check: ATT_q5 + sel_bias_q5 =  20083.76  ; SDO_q5 =  20083.76

Short summary (Q5)

Question 7 — Loading the female_lalonde.csv dataset

file_path <- "/Users/katiankurunzizagwaneza/Downloads/female_lalonde.csv"
if (file.exists(file_path)) {
  lal <- read.csv(file_path)
  cat("\n First few rows of data: \n")
  head(lal)
} else {
  message("File not found at: ", file_path, " — please set `file_path` to the correct location.")
}
## 
##  First few rows of data:
##   treated age educ nodegree married black hisp moa     re75      re76     re77
## 1      99  46   12        0       0     0    0  NA 11277.16  3183.434 16695.47
## 2      99  29   12        0       0     1    0  NA 21480.29 23706.422 22578.64
## 3      99  40   11        1       0     1    0  NA     0.00     0.000     0.00
## 4      99  37   12        0       0     1    0  NA     0.00     0.000     0.00
## 5      99  31   12        0       0     1    0  NA 10795.64 15578.506 13594.89
## 6      99  42    9        1       0     1    0  NA     0.00     0.000     0.00
##       re78     re79     re74 afdc75 nchildren75 zero74 zero75     redif lalonde
## 1 16953.62 17922.08 16217.85      0           0      0      0 6644.9248      NA
## 2 20687.76 21249.48 23252.09      0           0      0      0 -230.8145      NA
## 3     0.00     0.00     0.00      1           5      1      1    0.0000      NA
## 4     0.00     0.00     0.00      1           2      1      1    0.0000      NA
## 5 14481.43 13129.24 11919.14      0           0      0      0 2333.6035      NA
## 6     0.00     0.00     0.00      1           4      1      1    0.0000      NA
##   psid1 psid2 sample_lalonde sample_dw sample_early
## 1     1    NA              0         0            0
## 2     1    NA              0         0            0
## 3     1     1              0         0            0
## 4     1     1              0         0            0
## 5     1    NA              0         0            0
## 6     1     1              0         0            0
## Separate data into experimental and observational groups
rct_0 <- lal[lal$treated == 0, ]   # RCT control group
rct_1 <- lal[lal$treated == 1, ]   # RCT treatment group
psid  <- lal[lal$treated == 99, ]  # PSID observational comparison group


# 1. Regression to estimate treatment coefficients and CI interval for RCT units
rct_data <- lal[lal$treated != 99, ] # RCT data

regression_rct <- lm(rct_data$re79 ~ rct_data$age + rct_data$educ + rct_data$nodegree + rct_data$married + rct_data$black + rct_data$hisp + rct_data$re75 + rct_data$re74 + rct_data$nchildren75 + rct_data$zero74 + rct_data$zero75 + rct_data$treated, data = rct_data)
summary(regression_rct)
## 
## Call:
## lm(formula = rct_data$re79 ~ rct_data$age + rct_data$educ + rct_data$nodegree + 
##     rct_data$married + rct_data$black + rct_data$hisp + rct_data$re75 + 
##     rct_data$re74 + rct_data$nchildren75 + rct_data$zero74 + 
##     rct_data$zero75 + rct_data$treated, data = rct_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -7666  -4056  -2724   4199  24941 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          1972.33872 1807.45761   1.091  0.27540   
## rct_data$age           -0.82507   22.06427  -0.037  0.97018   
## rct_data$educ         182.33368  105.36726   1.730  0.08381 . 
## rct_data$nodegree    -762.31495  437.77162  -1.741  0.08188 . 
## rct_data$married      870.16551  898.83039   0.968  0.33319   
## rct_data$black        229.61323  709.20579   0.324  0.74618   
## rct_data$hisp         908.58533  829.37418   1.096  0.27352   
## rct_data$re75           0.16021    0.31893   0.502  0.61552   
## rct_data$re74          -0.02356    0.30040  -0.078  0.93751   
## rct_data$nchildren75   99.17803  119.36985   0.831  0.40623   
## rct_data$zero74      -907.72359 1292.61438  -0.702  0.48267   
## rct_data$zero75       741.29602 1216.25319   0.609  0.54232   
## rct_data$treated      868.89227  307.41041   2.826  0.00479 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5276 on 1173 degrees of freedom
##   (416 observations deleted due to missingness)
## Multiple R-squared:  0.02661,    Adjusted R-squared:  0.01665 
## F-statistic: 2.672 on 12 and 1173 DF,  p-value: 0.001503
confint(regression_rct, "rct_data$treated")
##                     2.5 %   97.5 %
## rct_data$treated 265.7566 1472.028

We dropped about 416 observations due to missing data on running the lm regression. That might have led to a wider confidence interval.

# 2. Create an observational dataset by deleting RCT controls and relabeling PSID units (treat=99) as 0.
obs_data <- rbind(rct_1, psid)
obs_data$treated[obs_data$treated == 99] <- 0
table(obs_data$treated)

# 3. Perform genetic matching (using the Matching library) with: pop.size = 100, wait.generations = 10, max.generations = 100

install.packages("Matching")
install.packages("rgenoud")
library(Matching)
library(rgenoud)

#Remove rows with missing data
# How many rows total
nrow(obs_data)
# columns to require non-missing
required <- c("treated","re79",
              "age", "educ", "nodegree", "married", "black", "hisp",
              "re75", "re74", "nchildren75", "zero74", "zero75")

initial_n <- nrow(obs_data)
obs_data <- obs_data[complete.cases(obs_data[, required]), ]
print(table(obs_data$treated))
cat("Dropped", initial_n - nrow(obs_data), "rows; remaining:", nrow(obs_data), "\n")

# Outcome and treatment
Y  <- obs_data$re79
Tr <- obs_data$treated

# Covariate matrix (base R specification)
X <- as.matrix(obs_data[, c(
  "age", "educ", "nodegree", "married", "black", "hisp",
  "re75", "re74", "nchildren75", "zero74", "zero75"
)])

# Genetic search for optimal weights
genout <- GenMatch( Tr = Tr, X  = X, pop.size = 100, wait.generations = 10, max.generations = 100) 


# Matching using optimal weights
mout <- Match( Y = Y, Tr = Tr, X = X, Weight.matrix = genout)
summary(mout)
# 4. Report balance results and lowest p-value.
# --- MatchBalance: bootstrap balance check (safe to missing covariates) ---
bal <- MatchBalance(
  Tr ~ age + educ + nodegree + married + black + hisp +
        re75 + re74 + nchildren75 + zero74 + zero75,
  match.out = mout,
  nboots = 500
)
print(bal)

# --- Balance checks and treatment-effect estimation ---
covars <- c("age", "educ", "nodegree", "married", "black", "hisp",
            "re75", "re74", "nchildren75", "zero74", "zero75")

# P-values before matching
before_pvals <- sapply(covars, function(v) {
  t.test(obs_data[[v]] ~ obs_data$treated)$p.value
})
names(before_pvals) <- covars

# P-values after matching (compare matched treated vs matched controls)
matched_t <- obs_data[mout$index.treated, ]
matched_c <- obs_data[mout$index.control, ]
after_pvals <- sapply(covars, function(v) {
  t.test(matched_t[[v]], matched_c[[v]])$p.value
})
names(after_pvals) <- covars

cat("Lowest p-value after matching:", format(min(after_pvals), digits = 4), "\n")
cat("P-values after matching:\n")
print(after_pvals)

# 5. Estimate treatment effect, standard error, and (bonus) confidence interval.

# ATT from matched pairs (direct paired differences)
pair_diffs <- Y[mout$index.treated] - Y[mout$index.control]
n_pairs <- length(pair_diffs)
att_est <- mean(pair_diffs)
att_se  <- sd(pair_diffs) / sqrt(n_pairs)
ci_mult <- qt(0.975, df = n_pairs - 1)
ci_lower <- att_est - ci_mult * att_se
ci_upper <- att_est + ci_mult * att_se
cat(sprintf("Estimated ATT = %.4f; SE = %.4f; 95%% CI = [%.4f, %.4f]\n",
            att_est, att_se, ci_lower, ci_upper))

# Also show Match() reported estimate/se if present
if (!is.null(mout$est)) cat(sprintf("Match() reported est = %.4f\n", mout$est))
if (!is.null(mout$se))  cat(sprintf("Match() reported se  = %.4f\n", mout$se))

# 6. Using the quantreg library, estimate and plot quantile treatment effects with CIs.

matched_treated  <- obs_data[mout$index.treated, ]
matched_control  <- obs_data[mout$index.control, ]

matched_data <- rbind(
  transform(matched_treated,  matched_pair = 1:length(mout$index.treated)),
  transform(matched_control, matched_pair = 1:length(mout$index.control))
)

###############################################
# Quantile Treatment Effects (quantreg version)
###############################################

library(quantreg)

# Construct matched dataset for rq
matched_treated  <- obs_data[mout$index.treated, ]
matched_control  <- obs_data[mout$index.control, ]

matched_treated$treated  <- 1
matched_control$treated  <- 0

matched_data <- rbind(matched_treated, matched_control)

# Quantiles to analyze
taus <- seq(0.1, 0.9, by = 0.1)

# Storage
qte_est  <- numeric(length(taus))
qte_low  <- numeric(length(taus))
qte_high <- numeric(length(taus))

for (i in seq_along(taus)) {
  tau <- taus[i]

  rq_fit <- rq(re79 ~ treated, tau = tau, data = matched_data)

  # Bootstrapped SEs for CI
  rq_sum <- summary(rq_fit, se = "boot", R = 500)

  est <- rq_sum$coefficients["treated", "Value"]
  se  <- rq_sum$coefficients["treated", "Std. Error"]

  # 95% CI
  qte_est[i]  <- est
  qte_low[i]  <- est - 1.96 * se
  qte_high[i] <- est + 1.96 * se
}

# --------------------------
# Plot QTEs with 95% CIs
# --------------------------
plot(
  taus, qte_est, type = "b", pch = 19,
  ylim = range(qte_low, qte_high),
  xlab = "Quantile (tau)", ylab = "Quantile Treatment Effect",
  main = "Quantile Treatment Effects with 95% CI (Matched Sample)"
)

# CI bands
segments(taus, qte_low, taus, qte_high, lwd = 2)
abline(h = 0, lty = 2)

Question 8

library(tidyverse)
library(haven)
library(Synth)
library(devtools)
library(SCtools)

read_data <- function(df)
{
  full_path <- paste("https://github.com/scunning1975/mixtape/raw/master/", 
                     df, sep = "")
  df <- read_dta(full_path)
  return(df)
}

texas <- read_data("texas.dta") %>%
  as.data.frame(.)

dataprep_out <- dataprep(
  foo = texas,
  predictors = c("poverty", "income"),
  predictors.op = "mean",
  time.predictors.prior = 1985:1993,
  special.predictors = list(
    list("bmprison", c(1988, 1990:1992), "mean"),
    list("alcohol", 1990, "mean"),
    list("aidscapita", 1990:1991, "mean"),
    list("black", 1990:1992, "mean"),
    list("perc1519", 1990, "mean")),
  dependent = "bmprison",
  unit.variable = "statefip",
  unit.names.variable = "state",
  time.variable = "year",
  treatment.identifier = 48,
  controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56),
  time.optimize.ssr = 1985:1993,
  time.plot = 1985:2000
)

synth_out <- synth(data.prep.obj = dataprep_out)

path.plot(synth_out, dataprep_out)
gaps.plot(synth_out, dataprep_out)
placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3)

plot_placebos(placebos)

mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE)

Appendix

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Argentina/Buenos_Aires
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_1.1.4
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.37     R6_2.6.1          fastmap_1.2.0     tidyselect_1.2.1 
##  [5] xfun_0.53         magrittr_2.0.3    glue_1.8.0        cachem_1.1.0     
##  [9] tibble_3.3.0      knitr_1.50        pkgconfig_2.0.3   htmltools_0.5.8.1
## [13] generics_0.1.4    rmarkdown_2.30    lifecycle_1.0.4   cli_3.6.5        
## [17] vctrs_0.6.5       sass_0.4.10       jquerylib_0.1.4   compiler_4.5.1   
## [21] rstudioapi_0.17.1 tools_4.5.1       pillar_1.11.0     evaluate_1.0.5   
## [25] bslib_0.9.0       rlang_1.1.6       jsonlite_2.0.0