title: “RCT simulation & bias decomposition” author: “Katia” date: “2025-11-27” output: html_document ———————
# Knit-friendly setup
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
set.seed(123)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
This R Markdown reproduces the simulations and bias-decomposition
exercises from Questions 3–5 and includes a short snippet for loading
the female_lalonde.csv file (Question 7). Each section
contains the data generation, estimands, naive estimator, bias
decomposition, and checks. At the end of each section there is a brief
summary of the results you can expand on for a report.
# ---------- 1. Data generation ----------
n_treat <- 40
n_control <- 100
mean_target <- 10
# Fixed shape so skewness is identical
shape <- 100 # skew = 2/sqrt(100) = 0.2
# Scale values chosen to force SD = 1 vs SD = 2
scale_treat <- mean_target / shape # = 0.1 → sd ≈ 1
scale_control <- (mean_target * 2) / shape # = 0.2 → sd ≈ 2
# Generate Gamma X1 values
X1_treat <- rgamma(n_treat, shape = shape, scale = scale_treat)
X1_control <- rgamma(n_control, shape = shape, scale = scale_control)
X1 <- c(X1_treat, X1_control)
W <- c(rep(1, n_treat), rep(0, n_control))
# ---------- 2. Treatment effect (heterogeneous) ----------
tau_i <- 2 * X1 # individual treatment effect
# ---------- 3. Potential outcomes (no irreducible error) ----------
alpha <- 5
gamma <- 1
Y0 <- alpha + gamma * X1
Y1 <- Y0 + tau_i
# Observed outcome
Y <- ifelse(W == 1, Y1, Y0)
df <- data.frame(Y = Y, W = W, X1 = X1, Y0 = Y0, Y1 = Y1, tau = tau_i)
# ---------- 4. Estimands ----------
ATE <- mean(df$tau)
ATT <- mean(df$tau[df$W == 1])
ATC <- mean(df$tau[df$W == 0])
# ---------- 5. Estimator (naive difference in means) ----------
naive_diff <- mean(df$Y[df$W == 1]) - mean(df$Y[df$W == 0])
# ---------- 6. Selection bias & heterogeneity term ----------
selection_bias <- mean(df$Y0[df$W == 1]) - mean(df$Y0[df$W == 0])
heterogeneity <- ATT - ATE
# ---------- 7. Decomposition checks ----------
decomp1 <- ATT + selection_bias
decomp2 <- heterogeneity + selection_bias
# Print raw numeric outputs (no rounding)
cat("---- Estimands & estimator (Q3) ----\n")
## ---- Estimands & estimator (Q3) ----
cat("Naive difference in means: ", naive_diff, "\n")
## Naive difference in means: 9.413472
cat("ATE (mean tau): ", ATE, "\n")
## ATE (mean tau): 34.38141
cat("ATT (mean tau | treated): ", ATT, "\n")
## ATT (mean tau | treated): 19.69439
cat("ATC (mean tau | control): ", ATC, "\n\n")
## ATC (mean tau | control): 40.25621
cat("Selection bias (E[Y0|W=1]-E[Y0|W=0]): ", selection_bias, "\n")
## Selection bias (E[Y0|W=1]-E[Y0|W=0]): -10.28091
cat("Heterogeneity (ATT - ATE): ", heterogeneity, "\n\n")
## Heterogeneity (ATT - ATE): -14.68702
cat("Check ATT + selection_bias = ", decomp1, " ; naive = ", naive_diff, "\n")
## Check ATT + selection_bias = 9.413472 ; naive = 9.413472
cat("Check (ATT-ATE) + selection_bias = naive - ATE -> ", decomp2, " = ", naive_diff - ATE, "\n")
## Check (ATT-ATE) + selection_bias = naive - ATE -> -24.96793 = -24.96793
Short summary (Q3)
X1 and the
treated and control groups have different variances (but same mean), ATT
and ATE will differ when X1 distributions differ in higher
moments.# 1. Quartic individual treatment effects
tau_i_q <- 2 * (X1^4)
# 2. Potential outcomes for quartic case
Y0_q <- alpha + gamma * X1
Y1_q <- Y0_q + tau_i_q
# 3. Observed outcome
Y_q <- ifelse(W == 1, Y1_q, Y0_q)
# 4. Estimands
ATE_q <- mean(tau_i_q)
ATT_q <- mean(tau_i_q[W == 1])
ATC_q <- mean(tau_i_q[W == 0])
# 5. Naive difference in means (SDO)
SDO_q <- mean(Y_q[W == 1]) - mean(Y_q[W == 0])
# 6. Selection bias
sel_bias_q <- mean(Y0_q[W == 1]) - mean(Y0_q[W == 0])
# 7. Bias decomposition
heterog_bias_q <- ATT_q - ATE_q
total_bias_q <- SDO_q - ATE_q
# Print raw numeric outputs (no rounding)
cat("---- Quartic case (Q4) ----\n")
## ---- Quartic case (Q4) ----
cat("ATE_q: ", ATE_q, "\n")
## ATE_q: 252710.3
cat("ATT_q: ", ATT_q, "\n")
## ATT_q: 19677
cat("ATC_q: ", ATC_q, "\n\n")
## ATC_q: 345923.7
cat("SDO_q (naive): ", SDO_q, "\n")
## SDO_q (naive): 19666.72
cat("Selection bias_q: ", sel_bias_q, "\n")
## Selection bias_q: -10.28091
cat("Heterogeneity bias_q (ATT_q - ATE_q): ", heterog_bias_q, "\n")
## Heterogeneity bias_q (ATT_q - ATE_q): -233033.3
cat("Total bias_q (SDO_q - ATE_q): ", total_bias_q, "\n")
## Total bias_q (SDO_q - ATE_q): -233043.6
cat("Check: ATT_q + sel_bias_q = ", ATT_q + sel_bias_q, " ; SDO_q = ", SDO_q, "\n")
## Check: ATT_q + sel_bias_q = 19666.72 ; SDO_q = 19666.72
Short summary (Q4)
X1).X1.# Re-generate X1 with different skew but same mean
# Parameters to achieve different skew
mu <- 10 # desired mean
# Treated: near-zero skew (large shape)
shape_treat <- 5000
scale_treat <- mu / shape_treat
# Control: high skew (shape = 1)
shape_ctrl <- 1
scale_ctrl <- mu / shape_ctrl
X1_treat_q5 <- rgamma(n_treat, shape = shape_treat, scale = scale_treat)
X1_control_q5 <- rgamma(n_control, shape = shape_ctrl, scale = scale_ctrl)
X1_q5 <- c(X1_treat_q5, X1_control_q5)
W_q5 <- c(rep(1, n_treat), rep(0, n_control))
# Quartic effects
tau_i_q5 <- 2 * (X1_q5^4)
# Potential outcomes
Y0_q5 <- alpha + gamma * X1_q5
Y1_q5 <- Y0_q5 + tau_i_q5
Y_q5 <- ifelse(W_q5 == 1, Y1_q5, Y0_q5)
# Estimands
ATE_q5 <- mean(tau_i_q5)
ATT_q5 <- mean(tau_i_q5[W_q5 == 1])
ATC_q5 <- mean(tau_i_q5[W_q5 == 0])
# Naive estimator
SDO_q5 <- mean(Y_q5[W_q5 == 1]) - mean(Y_q5[W_q5 == 0])
# Bias decomposition
sel_bias_q5 <- mean(Y0_q5[W_q5 == 1]) - mean(Y0_q5[W_q5 == 0])
heterog_bias_q5 <- ATT_q5 - ATE_q5
total_bias_q5 <- SDO_q5 - ATE_q5
# Print raw numeric outputs (no rounding)
cat("---- Q5: Quartic + Skew Difference ----\n")
## ---- Q5: Quartic + Skew Difference ----
cat("ATE_q5: ", ATE_q5, "\n")
## ATE_q5: 439129.6
cat("ATT_q5: ", ATT_q5, "\n")
## ATT_q5: 20083.69
cat("ATC_q5: ", ATC_q5, "\n\n")
## ATC_q5: 606748
cat("SDO_q5 (naive): ", SDO_q5, "\n")
## SDO_q5 (naive): 20083.76
cat("Selection bias_q5: ", sel_bias_q5, "\n")
## Selection bias_q5: 0.06483348
cat("Heterogeneity bias_q5: ", heterog_bias_q5, "\n")
## Heterogeneity bias_q5: -419045.9
cat("Total bias_q5: ", total_bias_q5, "\n")
## Total bias_q5: -419045.8
cat("Check: ATT_q5 + sel_bias_q5 = ", ATT_q5 + sel_bias_q5, " ; SDO_q5 = ", SDO_q5, "\n")
## Check: ATT_q5 + sel_bias_q5 = 20083.76 ; SDO_q5 = 20083.76
Short summary (Q5)
female_lalonde.csv
datasetfile_path <- "/Users/katiankurunzizagwaneza/Downloads/female_lalonde.csv"
if (file.exists(file_path)) {
lal <- read.csv(file_path)
cat("\n First few rows of data: \n")
head(lal)
} else {
message("File not found at: ", file_path, " — please set `file_path` to the correct location.")
}
##
## First few rows of data:
## treated age educ nodegree married black hisp moa re75 re76 re77
## 1 99 46 12 0 0 0 0 NA 11277.16 3183.434 16695.47
## 2 99 29 12 0 0 1 0 NA 21480.29 23706.422 22578.64
## 3 99 40 11 1 0 1 0 NA 0.00 0.000 0.00
## 4 99 37 12 0 0 1 0 NA 0.00 0.000 0.00
## 5 99 31 12 0 0 1 0 NA 10795.64 15578.506 13594.89
## 6 99 42 9 1 0 1 0 NA 0.00 0.000 0.00
## re78 re79 re74 afdc75 nchildren75 zero74 zero75 redif lalonde
## 1 16953.62 17922.08 16217.85 0 0 0 0 6644.9248 NA
## 2 20687.76 21249.48 23252.09 0 0 0 0 -230.8145 NA
## 3 0.00 0.00 0.00 1 5 1 1 0.0000 NA
## 4 0.00 0.00 0.00 1 2 1 1 0.0000 NA
## 5 14481.43 13129.24 11919.14 0 0 0 0 2333.6035 NA
## 6 0.00 0.00 0.00 1 4 1 1 0.0000 NA
## psid1 psid2 sample_lalonde sample_dw sample_early
## 1 1 NA 0 0 0
## 2 1 NA 0 0 0
## 3 1 1 0 0 0
## 4 1 1 0 0 0
## 5 1 NA 0 0 0
## 6 1 1 0 0 0
## Separate data into experimental and observational groups
rct_0 <- lal[lal$treated == 0, ] # RCT control group
rct_1 <- lal[lal$treated == 1, ] # RCT treatment group
psid <- lal[lal$treated == 99, ] # PSID observational comparison group
# 1. Regression to estimate treatment coefficients and CI interval for RCT units
rct_data <- lal[lal$treated != 99, ] # RCT data
regression_rct <- lm(rct_data$re79 ~ rct_data$age + rct_data$educ + rct_data$nodegree + rct_data$married + rct_data$black + rct_data$hisp + rct_data$re75 + rct_data$re74 + rct_data$nchildren75 + rct_data$zero74 + rct_data$zero75 + rct_data$treated, data = rct_data)
summary(regression_rct)
##
## Call:
## lm(formula = rct_data$re79 ~ rct_data$age + rct_data$educ + rct_data$nodegree +
## rct_data$married + rct_data$black + rct_data$hisp + rct_data$re75 +
## rct_data$re74 + rct_data$nchildren75 + rct_data$zero74 +
## rct_data$zero75 + rct_data$treated, data = rct_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7666 -4056 -2724 4199 24941
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1972.33872 1807.45761 1.091 0.27540
## rct_data$age -0.82507 22.06427 -0.037 0.97018
## rct_data$educ 182.33368 105.36726 1.730 0.08381 .
## rct_data$nodegree -762.31495 437.77162 -1.741 0.08188 .
## rct_data$married 870.16551 898.83039 0.968 0.33319
## rct_data$black 229.61323 709.20579 0.324 0.74618
## rct_data$hisp 908.58533 829.37418 1.096 0.27352
## rct_data$re75 0.16021 0.31893 0.502 0.61552
## rct_data$re74 -0.02356 0.30040 -0.078 0.93751
## rct_data$nchildren75 99.17803 119.36985 0.831 0.40623
## rct_data$zero74 -907.72359 1292.61438 -0.702 0.48267
## rct_data$zero75 741.29602 1216.25319 0.609 0.54232
## rct_data$treated 868.89227 307.41041 2.826 0.00479 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5276 on 1173 degrees of freedom
## (416 observations deleted due to missingness)
## Multiple R-squared: 0.02661, Adjusted R-squared: 0.01665
## F-statistic: 2.672 on 12 and 1173 DF, p-value: 0.001503
confint(regression_rct, "rct_data$treated")
## 2.5 % 97.5 %
## rct_data$treated 265.7566 1472.028
We dropped about 416 observations due to missing data on running the lm regression. That might have led to a wider confidence interval.
# 2. Create an observational dataset by deleting RCT controls and relabeling PSID units (treat=99) as 0.
obs_data <- rbind(rct_1, psid)
obs_data$treated[obs_data$treated == 99] <- 0
table(obs_data$treated)
# 3. Perform genetic matching (using the Matching library) with: pop.size = 100, wait.generations = 10, max.generations = 100
install.packages("Matching")
install.packages("rgenoud")
library(Matching)
library(rgenoud)
#Remove rows with missing data
# How many rows total
nrow(obs_data)
# columns to require non-missing
required <- c("treated","re79",
"age", "educ", "nodegree", "married", "black", "hisp",
"re75", "re74", "nchildren75", "zero74", "zero75")
initial_n <- nrow(obs_data)
obs_data <- obs_data[complete.cases(obs_data[, required]), ]
print(table(obs_data$treated))
cat("Dropped", initial_n - nrow(obs_data), "rows; remaining:", nrow(obs_data), "\n")
# Outcome and treatment
Y <- obs_data$re79
Tr <- obs_data$treated
# Covariate matrix (base R specification)
X <- as.matrix(obs_data[, c(
"age", "educ", "nodegree", "married", "black", "hisp",
"re75", "re74", "nchildren75", "zero74", "zero75"
)])
# Genetic search for optimal weights
genout <- GenMatch( Tr = Tr, X = X, pop.size = 100, wait.generations = 10, max.generations = 100)
# Matching using optimal weights
mout <- Match( Y = Y, Tr = Tr, X = X, Weight.matrix = genout)
summary(mout)
# 4. Report balance results and lowest p-value.
# --- MatchBalance: bootstrap balance check (safe to missing covariates) ---
bal <- MatchBalance(
Tr ~ age + educ + nodegree + married + black + hisp +
re75 + re74 + nchildren75 + zero74 + zero75,
match.out = mout,
nboots = 500
)
print(bal)
# --- Balance checks and treatment-effect estimation ---
covars <- c("age", "educ", "nodegree", "married", "black", "hisp",
"re75", "re74", "nchildren75", "zero74", "zero75")
# P-values before matching
before_pvals <- sapply(covars, function(v) {
t.test(obs_data[[v]] ~ obs_data$treated)$p.value
})
names(before_pvals) <- covars
# P-values after matching (compare matched treated vs matched controls)
matched_t <- obs_data[mout$index.treated, ]
matched_c <- obs_data[mout$index.control, ]
after_pvals <- sapply(covars, function(v) {
t.test(matched_t[[v]], matched_c[[v]])$p.value
})
names(after_pvals) <- covars
cat("Lowest p-value after matching:", format(min(after_pvals), digits = 4), "\n")
cat("P-values after matching:\n")
print(after_pvals)
# 5. Estimate treatment effect, standard error, and (bonus) confidence interval.
# ATT from matched pairs (direct paired differences)
pair_diffs <- Y[mout$index.treated] - Y[mout$index.control]
n_pairs <- length(pair_diffs)
att_est <- mean(pair_diffs)
att_se <- sd(pair_diffs) / sqrt(n_pairs)
ci_mult <- qt(0.975, df = n_pairs - 1)
ci_lower <- att_est - ci_mult * att_se
ci_upper <- att_est + ci_mult * att_se
cat(sprintf("Estimated ATT = %.4f; SE = %.4f; 95%% CI = [%.4f, %.4f]\n",
att_est, att_se, ci_lower, ci_upper))
# Also show Match() reported estimate/se if present
if (!is.null(mout$est)) cat(sprintf("Match() reported est = %.4f\n", mout$est))
if (!is.null(mout$se)) cat(sprintf("Match() reported se = %.4f\n", mout$se))
# 6. Using the quantreg library, estimate and plot quantile treatment effects with CIs.
matched_treated <- obs_data[mout$index.treated, ]
matched_control <- obs_data[mout$index.control, ]
matched_data <- rbind(
transform(matched_treated, matched_pair = 1:length(mout$index.treated)),
transform(matched_control, matched_pair = 1:length(mout$index.control))
)
###############################################
# Quantile Treatment Effects (quantreg version)
###############################################
library(quantreg)
# Construct matched dataset for rq
matched_treated <- obs_data[mout$index.treated, ]
matched_control <- obs_data[mout$index.control, ]
matched_treated$treated <- 1
matched_control$treated <- 0
matched_data <- rbind(matched_treated, matched_control)
# Quantiles to analyze
taus <- seq(0.1, 0.9, by = 0.1)
# Storage
qte_est <- numeric(length(taus))
qte_low <- numeric(length(taus))
qte_high <- numeric(length(taus))
for (i in seq_along(taus)) {
tau <- taus[i]
rq_fit <- rq(re79 ~ treated, tau = tau, data = matched_data)
# Bootstrapped SEs for CI
rq_sum <- summary(rq_fit, se = "boot", R = 500)
est <- rq_sum$coefficients["treated", "Value"]
se <- rq_sum$coefficients["treated", "Std. Error"]
# 95% CI
qte_est[i] <- est
qte_low[i] <- est - 1.96 * se
qte_high[i] <- est + 1.96 * se
}
# --------------------------
# Plot QTEs with 95% CIs
# --------------------------
plot(
taus, qte_est, type = "b", pch = 19,
ylim = range(qte_low, qte_high),
xlab = "Quantile (tau)", ylab = "Quantile Treatment Effect",
main = "Quantile Treatment Effects with 95% CI (Matched Sample)"
)
# CI bands
segments(taus, qte_low, taus, qte_high, lwd = 2)
abline(h = 0, lty = 2)
library(tidyverse)
library(haven)
library(Synth)
library(devtools)
library(SCtools)
read_data <- function(df)
{
full_path <- paste("https://github.com/scunning1975/mixtape/raw/master/",
df, sep = "")
df <- read_dta(full_path)
return(df)
}
texas <- read_data("texas.dta") %>%
as.data.frame(.)
dataprep_out <- dataprep(
foo = texas,
predictors = c("poverty", "income"),
predictors.op = "mean",
time.predictors.prior = 1985:1993,
special.predictors = list(
list("bmprison", c(1988, 1990:1992), "mean"),
list("alcohol", 1990, "mean"),
list("aidscapita", 1990:1991, "mean"),
list("black", 1990:1992, "mean"),
list("perc1519", 1990, "mean")),
dependent = "bmprison",
unit.variable = "statefip",
unit.names.variable = "state",
time.variable = "year",
treatment.identifier = 48,
controls.identifier = c(1,2,4:6,8:13,15:42,44:47,49:51,53:56),
time.optimize.ssr = 1985:1993,
time.plot = 1985:2000
)
synth_out <- synth(data.prep.obj = dataprep_out)
path.plot(synth_out, dataprep_out)
gaps.plot(synth_out, dataprep_out)
placebos <- generate.placebos(dataprep_out, synth_out, Sigf.ipop = 3)
plot_placebos(placebos)
mspe.plot(placebos, discard.extreme = TRUE, mspe.limit = 1, plot.hist = TRUE)
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Argentina/Buenos_Aires
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.37 R6_2.6.1 fastmap_1.2.0 tidyselect_1.2.1
## [5] xfun_0.53 magrittr_2.0.3 glue_1.8.0 cachem_1.1.0
## [9] tibble_3.3.0 knitr_1.50 pkgconfig_2.0.3 htmltools_0.5.8.1
## [13] generics_0.1.4 rmarkdown_2.30 lifecycle_1.0.4 cli_3.6.5
## [17] vctrs_0.6.5 sass_0.4.10 jquerylib_0.1.4 compiler_4.5.1
## [21] rstudioapi_0.17.1 tools_4.5.1 pillar_1.11.0 evaluate_1.0.5
## [25] bslib_0.9.0 rlang_1.1.6 jsonlite_2.0.0