# general setup
rm(list = ls()) # del all objects and functions
gc() #cleans memory
options(scipen = 999) # tell R not to use Scientific notation
options(digits = 5) # controls how many digits are printed by default
# libraries
pac.vec <-
c(
"did",
"broom",
"data.table",
"tidyverse"
)
lapply(pac.vec, require, character.only = T)
# setup for ggplot graph layout
theme_set(theme_bw())
theme_update(
# change axis text size
axis.text = element_text(size = 14),
axis.title = element_text(size = 16),
# change title alignment and size
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 18),
# what lines in background of plot
panel.grid.major.y = element_line(color = "gray"),
panel.grid.major.x = element_line(color = "gray"),
# legend text size and outline
legend.text = element_text(size = 14),
legend.background = element_rect(
fill = "white",
colour = "black",
linewidth = 0.2,
linetype = "solid"
),
# facet title size
strip.text.x = element_text(size = 16),
strip.text.y = element_text(size = 16)
)
# setup for graph colors
orange <- "#f46d43"
lightorange <- "#fdae61"
blue <- "#0072B2"
lightblue <- "#74add1"
# setup for graph saving for slides
height = 4
width = 8
# parameters
set.seed(1)Going over Callaway and Sant’Anna Double Robust Estimator
This document is mainly code oriented, and its purpose is to code from scratch, the double robust DID estimator proposed by Callaway and Sant’Anna in their paper: “Difference-in-Differences with multiple time periods”.
Setup
the dgp
make_data <- function(nobs = 1000,
nstates = 30,
state_treat = c(1, 11, 21),
years = c(1980, 2010),
cohort_treat = c(1990, 2000, 9999),
treatments = c(1, 1, 1),
rho = 0) {
dat = CJ(firm = 1:nobs, year = years[1]:years[2])[
, state := sample(1:nstates, 1), by = "firm"][
, year_fe := rnorm(n=1, mean = (year-1980)/10, sd = 1), by = "year"][
, unit_fe := rnorm(1, mean = (31 - state), sd = 1), by = "firm"]
setkey(dat, state, firm, year)
treatment_groups = data.table(state = state_treat,
cohort = cohort_treat,
hat_gamma = treatments)
dat = treatment_groups[dat, roll = TRUE, on = "state"]
# create 3 covariates, with correlation rho between the first X1 and X2, X1 and X3, and zero between X2 and X3
X = gendata::genmvnorm(cor = c(rho, rho, 0),
k = 3,
n = nobs) %>% as.data.table()
X[, firm := 1:nobs]
dat = dat[X, on = "firm"]
dat[, unit_fe := ifelse(cohort == 1990,
runif(1, min = 2, max = 3),
runif(1, min = 1, max = 2)), by = "firm"][
, unit_fe := ifelse(cohort == 9999,runif(1, min = 0, max = 1),unit_fe), by = "firm"][
, treat := as.numeric(year >= cohort)][, gamma := rnorm(.N, mean = hat_gamma, sd = .2)][
, tau := fifelse(treat == 1, 1*(X1 > 0), 0)][
, cumtau := cumsum(tau), by = "firm"][
, error := rnorm(.N, 0, .5)][
, Y0 := unit_fe + year_fe + 0.2*(X1 > 0) + 0.1*X2^2 + 0.1*X1*X3][
, Y := Y0 + cumtau + error][
, time_to_treat := year - cohort]
return(dat)
}look at the data
data = make_data() |> tibble()
data <- data |>
group_by(cohort, year) |>
summarise(Y_mean = mean(Y), .groups = "keep") |>
right_join(data, by = c("cohort", "year"))
data |>
ggplot() +
geom_line(aes(x = year, y = Y, group = firm), color = "gray") +
geom_line(aes(x = year, y = Y_mean, color = as.factor(cohort))) +
facet_grid(cols = vars(cohort))the dr learner
a target estimand
data <- make_data()
# the att(1990, 1991) is equal 1
data |> filter(cohort == 1990 & year == 1991) |> pull(cumtau) |> mean()[1] 1.076
the known did estimators
mean_Y <- function(data, cohort_train, year_train) data |> filter(cohort == cohort_train & year == year_train) |> pull(Y) |> mean()
coef_did <- mean_Y(data, 1990, 1991) - mean_Y(data, 1990, 1989) -
(mean_Y(data, 9999, 1991) - mean_Y(data, 9999, 1989))
did_cs <- did::att_gt(yname = "Y", tname = "year", idname = "firm", gname = "cohort", xformla = ~X1 + X2 + X3, control_group = "nevertreated", data = data)
coef_cs <- did_cs |> broom::tidy() |> filter(group == 1990, time == 1991) |> pull(estimate)looks okay
lets calculate some outcome functions
So the moment equation here is
\[\begin{align*} & Y_{it}-Y_{i\delta_{G_{i}}}-\left[\mu_{\infty,t}\left(X_{i}\right)-\mu_{\infty,\delta_{G_{i}}}\left(X_{i}\right)\right]=\tau_{it}+\varepsilon_{it}\\ \text{Denote } & Y^{*}=Y_{it}-Y_{i\delta_{G_{i}}}-\left[\mu_{\infty,t}\left(X_{i}\right)-\mu_{\infty,\delta_{G_{i}}}\left(X_{i}\right)\right]\\ \widehat{\tau} & =\arg\min_{\theta}\left\{ E\left[\left(Y^{*}-\theta\right)^{2}\right]\right\} \end{align*}\]
data <- make_data()
# train a model for the conditional expectation of never treated outcomes
xgb_mod <- xgboost::xgboost(data = data |> filter(cohort == 9999) |> select(X1, X2, X3, year) |> as.matrix(),
label = data |> filter(cohort == 9999) |> pull(Y),
nrounds = 500, verbose = F)
# predict these for the treated cohort
mu_control_pred <- function(cohort_test, year_test) {
test_data <- data |> filter(cohort == cohort_test, year == year_test) |> select(X1,X2,X3,year) |> as.matrix()
return(predict(xgb_mod, test_data))
}
Y_treat_post = data |> filter(cohort == 1990, year == 1991) |> pull(Y)
Y_treat_pre = data |> filter(cohort == 1990, year == 1989) |> pull(Y)
Y_star = Y_treat_post - Y_treat_pre - (mu_control_pred(1990, 1991) - mu_control_pred(1990, 1989))
coef_db <- lm(Y_star ~ 1)$coefficients[1]lets calculate some propensity scores
step 1
Sketch for (for some variable \(Z\))
\[\mathbb{E}\left[Z\mid G_{i}=g\right]=(\mathbb{E}[G_{ig}])^{-1}\times\mathbb{E}[G_{ig}\times Z]\]
Average \(Z\) for \(G_{i}=g\): \(N_{g}^{-1}\sum_{i:G_{i}=g}Z_{i}\)
Average \(Z\) for all \(i\): \(N^{-1}\sum_{i}Z_{i}\)
Whats the relation?
\[\begin{align*} & \dfrac{1}{N}\sum_{i}Z_{i}\times\dfrac{1}{P(G_{i}=g)}\\ & =\dfrac{1}{P(G_{i}=g)\times N}\sum_{i}Z_{i}=\dfrac{1}{N_{g}}\sum_{i}Z_{i}\\ & \dfrac{1}{N_{g}}\sum_{i}\left(Z_{i}\times1\left(G_{i}=g\right)\right)\\ & =\dfrac{1}{N_{g}}\sum_{i:G_{i}=g}Z_{i} \end{align*}\]
So
\[\dfrac{1}{N_{g}}\sum_{i:G_{i}=g}Z_{i}=\dfrac{1}{N\times P(G_{i}=g)}\sum_{i}Z_{i}G_{ig}\]
step 2
Consider \[\mathbb{E}_{X}\left\{ \mathbb{E}\left[Y_{t}-Y_{g-1}\mid X,C_{i}=1\right]\mid G_{ig}=1\right\}\]
Due to above, \[\mathbb{E}_{X}\{\mathbb{E}\left[Y_{t}-Y_{g-1}\mid X,C_{i}=1\right]\mid G_{ig}=1\}=(\mathbb{E}[G_{ig}])^{-1}\mathbb{E}\left[G_{ig}(Y_{t}-Y_{g-1})\mid X,C_{i}=1\right]\]
step 3
Define the dummy variable \(1\left(G_{i}=g\right)=G_{i,g}\)
Define the generalized propensity score
\[p_{g,s}\left(X\right)=P\left(G_{i,g}\mid X,G_{i,g}+\left(1-W_{is}\right)\left(1-G_{ig}\right)=1\right)\]
which is the probability of being treated at \(g\) conditional on covariates \(X\) for the group treated at time \(g\) and the not yet treated group at time \(s\). It holds by definition that \[p_{g\infty}(X)=\frac{\mathbb{E}[G_{ig}\mid X]}{\mathbb{E}[G_{ig}+C_{i}\mid X]}\]
One can show that \[\dfrac{\mathbb{E}\left[G_{ig}(Y_{t}-Y_{g-1})\mid X,C_{i}=1\right]}{\mathbb{E}[G_{ig}]}=\dfrac{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}(Y_{t}-Y_{g-1})\right]}{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}\right]}\]
step 4
\[\begin{align} ATT(g,t) & =\mathbb{E}\left[Y_{it}(g)-Y_{it}(\infty)\mid G_{ig}=1\right]\nonumber\\ & =\mathbb{E}\left\{E\left[Y_{it}-Y_{i,g-1}\mid X,G_{ig}=1\right]-E\left[Y_{it}-Y_{i,g-1}\mid X,C_{i}=1\right]\mid G_{ig}=1\right\}\nonumber\\ & =\mathbb{E}\left[\dfrac{G_{ig}\left(Y_{it}-Y_{i,g-1}\right)}{\mathbb{E}\left[G_{ig}\right]}-\dfrac{\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}(Y_{t}-Y_{g-1})}{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}\right]}\right]\nonumber\\ & =\mathbb{E}\left[\left(\dfrac{G_{ig}}{\mathbb{E}\left[G_{ig}\right]}-\dfrac{\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}}{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}\right]}\right)\left(Y_{it}-Y_{i,g-1}\right)\right] \end{align}\]
data <- make_data()
data <- data |>
filter(cohort %in% c(1990, 9999)) |>
mutate(g_i1990 = 1*(cohort == 1990),
c_i = 1*(cohort == 9999))
xgb_mod <- xgboost::xgboost(data = data |> filter(year == 1990) |> select(X1, X2, X3) |> as.matrix(),
label = data |> filter(year == 1990) |> pull(g_i1990),
nrounds = 500, verbose = F)
p_hat <- xgb_mod |> predict(data |> filter(year == 1989) |> select(X1, X2, X3) |> as.matrix())
g_i1990 <- data |> filter(year == 1981) |> pull(g_i1990)
E_g_i1990 <- mean(g_i1990)
c_i <- data |> filter(year == 1981) |> pull(c_i)
ip <- (p_hat * c_i) / (1 - p_hat)
E_ip <- mean(ip)
aipw <- g_i1990/E_g_i1990 - ip / E_ip
Y_treat_post = data |> filter(year == 1991) |> pull(Y)
Y_treat_pre = data |> filter(year == 1989) |> pull(Y)
Y_star = aipw * (Y_treat_post - Y_treat_pre)
coef_aipw <- lm(Y_star ~ 1)$coefficients[1]lets do it double robust
data <- make_data()
# conditional mean outcome for the controls
mu_control_pred <- function(data, cohort_test, year_test, cohort_train) {
xgb_mod <- xgboost::xgboost(data = data |> filter(cohort == cohort_train) |> select(X1, X2, X3, year) |> as.matrix(),
label = data |> filter(cohort == cohort_train) |> pull(Y),
nrounds = 500, verbose = F)
test_data <- data |> filter(cohort %in% c(cohort_test, cohort_train), year == year_test) |> select(X1,X2,X3,year) |> as.matrix()
return(predict(xgb_mod, test_data))
}
# aipw
propensity_pred <- function(data, cohort_test, cohort_train) {
data <- data |>
filter(cohort %in% c(cohort_test, cohort_train)) |>
mutate(g_i1990 = 1*(cohort == cohort_test),
c_i = 1*(cohort == cohort_train))
xgb_mod <- xgboost::xgboost(data = data |> filter(cohort %in% c(cohort_test, cohort_train), year == cohort_test) |> select(X1, X2, X3) |> as.matrix(),
label = data |> filter(cohort %in% c(cohort_test, cohort_train),year == cohort_test) |> pull(g_i1990),
nrounds = 500, verbose = F)
p_hat <- xgb_mod |> predict(data |> filter(year == cohort_test) |> select(X1, X2, X3) |> as.matrix())
g_i1990 <- data |> filter(year == cohort_test) |> pull(g_i1990)
E_g_i1990 <- mean(g_i1990)
c_i <- data |> filter(year == cohort_test) |> pull(c_i)
ip <- (p_hat * c_i) / (1 - p_hat)
E_ip <- mean(ip)
aipw <- g_i1990 / E_g_i1990 - ip / E_ip
return(aipw)
}
Y_treat_post = data |> filter(cohort %in% c(1990, 9999), year == 1991) |> pull(Y)
Y_treat_pre = data |> filter(cohort %in% c(1990, 9999), year == 1989) |> pull(Y)
aipw <- propensity_pred(data, 1990, 9999)
mu_control <- mu_control_pred(data, 1990, 1991, 9999) - mu_control_pred(data, 1990, 1989, 9999)
Y_star = aipw * (Y_treat_post - Y_treat_pre - mu_control)
coef_dr <- lm(Y_star ~ 1)$coefficients[1]lets compare
tibble(coef_did, coef_cs, coef_db, coef_aipw, coef_dr)# A tibble: 1 × 5
coef_did coef_cs coef_db coef_aipw coef_dr
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1.12 1.13 0.921 0.935 1.03