Going over Callaway and Sant’Anna Double Robust Estimator

Author

Dor Leventer

Published

March 7, 2023

This document is mainly code oriented, and its purpose is to code from scratch, the double robust DID estimator proposed by Callaway and Sant’Anna in their paper: “Difference-in-Differences with multiple time periods”.

Setup

# general setup
rm(list = ls()) # del all objects and functions
gc() #cleans memory
options(scipen = 999) # tell R not to use Scientific notation
options(digits = 5) # controls how many digits are printed by default

# libraries
pac.vec <-
  c(
    "did",
    "broom",
    "data.table",
    "tidyverse"
  )
lapply(pac.vec, require, character.only = T)

# setup for ggplot graph layout
theme_set(theme_bw())
theme_update(
  # change axis text size
  axis.text = element_text(size = 14),
  axis.title = element_text(size = 16),
  # change title alignment and size
  plot.title = element_text(hjust = 0.5, size = 20),
  plot.subtitle = element_text(hjust = 0.5, size = 18),
  # what lines in background of plot
  panel.grid.major.y = element_line(color = "gray"),
  panel.grid.major.x = element_line(color = "gray"),
  # legend text size and outline
  legend.text = element_text(size = 14),
  legend.background = element_rect(
    fill = "white",
    colour = "black",
    linewidth = 0.2,
    linetype = "solid"
  ),
  # facet title size
  strip.text.x = element_text(size = 16),
  strip.text.y = element_text(size = 16)
)

# setup for graph colors
orange <- "#f46d43"
lightorange <- "#fdae61"
blue <- "#0072B2"
lightblue <- "#74add1"
  
# setup for graph saving for slides
height = 4
width = 8

# parameters
set.seed(1)

the dgp

make_data <- function(nobs = 1000,
                      nstates = 30,
                      state_treat = c(1, 11, 21),
                      years = c(1980, 2010),
                      cohort_treat = c(1990, 2000, 9999),
                      treatments = c(1, 1, 1),
                      rho = 0) {
  dat = CJ(firm = 1:nobs, year = years[1]:years[2])[
    , state := sample(1:nstates, 1), by = "firm"][
    , year_fe := rnorm(n=1, mean = (year-1980)/10, sd = 1), by = "year"][
    , unit_fe := rnorm(1, mean = (31 - state), sd = 1), by = "firm"]
  
  setkey(dat, state, firm, year)
  
  treatment_groups = data.table(state = state_treat,
                                cohort = cohort_treat,
                                hat_gamma = treatments)
  dat = treatment_groups[dat, roll = TRUE, on = "state"]
  
  # create 3 covariates, with correlation rho between the first X1 and X2, X1 and X3, and zero between X2 and X3
  X = gendata::genmvnorm(cor = c(rho, rho, 0),
                         k = 3,
                         n = nobs) %>% as.data.table()
  X[, firm := 1:nobs]
  dat = dat[X, on = "firm"]
  
  dat[, unit_fe := ifelse(cohort == 1990,
                          runif(1, min = 2, max = 3),
                          runif(1, min = 1, max = 2)), by = "firm"][
      , unit_fe := ifelse(cohort == 9999,runif(1, min = 0, max = 1),unit_fe), by = "firm"][
      , treat  := as.numeric(year >= cohort)][, gamma  := rnorm(.N, mean = hat_gamma, sd = .2)][
      , tau    := fifelse(treat == 1, 1*(X1 > 0), 0)][
      , cumtau := cumsum(tau), by = "firm"][
      , error  := rnorm(.N, 0, .5)][
      , Y0 := unit_fe + year_fe + 0.2*(X1 > 0) + 0.1*X2^2 + 0.1*X1*X3][
      , Y := Y0 + cumtau + error][
      , time_to_treat := year - cohort]
  
  return(dat)
}

look at the data

data = make_data() |> tibble()
data <- data |> 
  group_by(cohort, year) |> 
  summarise(Y_mean = mean(Y), .groups = "keep") |> 
  right_join(data, by = c("cohort", "year"))

data |> 
  ggplot() + 
  geom_line(aes(x = year, y = Y, group = firm), color = "gray") +
  geom_line(aes(x = year, y = Y_mean, color = as.factor(cohort))) + 
  facet_grid(cols = vars(cohort))

the dr learner

a target estimand

data <- make_data()

# the att(1990, 1991) is equal 1
data |> filter(cohort == 1990 & year == 1991) |> pull(cumtau) |> mean()
[1] 1.076

the known did estimators

mean_Y <- function(data, cohort_train, year_train) data |> filter(cohort == cohort_train & year == year_train) |> pull(Y) |> mean()

coef_did <- mean_Y(data, 1990, 1991) - mean_Y(data, 1990, 1989) -
  (mean_Y(data, 9999, 1991) - mean_Y(data, 9999, 1989))

did_cs <- did::att_gt(yname = "Y", tname = "year", idname = "firm", gname = "cohort", xformla = ~X1 + X2 + X3, control_group = "nevertreated", data = data)
coef_cs <- did_cs |> broom::tidy() |> filter(group == 1990, time == 1991) |> pull(estimate)

looks okay

lets calculate some outcome functions

So the moment equation here is

\[\begin{align*} & Y_{it}-Y_{i\delta_{G_{i}}}-\left[\mu_{\infty,t}\left(X_{i}\right)-\mu_{\infty,\delta_{G_{i}}}\left(X_{i}\right)\right]=\tau_{it}+\varepsilon_{it}\\ \text{Denote } & Y^{*}=Y_{it}-Y_{i\delta_{G_{i}}}-\left[\mu_{\infty,t}\left(X_{i}\right)-\mu_{\infty,\delta_{G_{i}}}\left(X_{i}\right)\right]\\ \widehat{\tau} & =\arg\min_{\theta}\left\{ E\left[\left(Y^{*}-\theta\right)^{2}\right]\right\} \end{align*}\]

data <- make_data()

# train a model for the conditional expectation of never treated outcomes
xgb_mod <- xgboost::xgboost(data = data |> filter(cohort == 9999) |> select(X1, X2, X3, year) |> as.matrix(), 
                 label = data |> filter(cohort == 9999) |> pull(Y), 
                 nrounds = 500, verbose = F)

# predict these for the treated cohort
mu_control_pred <- function(cohort_test, year_test) {
  test_data <- data |> filter(cohort == cohort_test, year == year_test) |> select(X1,X2,X3,year) |> as.matrix()
  return(predict(xgb_mod, test_data))
}

Y_treat_post = data |> filter(cohort == 1990, year == 1991) |> pull(Y)
Y_treat_pre = data |> filter(cohort == 1990, year == 1989) |> pull(Y)
Y_star = Y_treat_post - Y_treat_pre - (mu_control_pred(1990, 1991) - mu_control_pred(1990, 1989))

coef_db <- lm(Y_star ~ 1)$coefficients[1]

lets calculate some propensity scores

step 1

Sketch for (for some variable \(Z\))

\[\mathbb{E}\left[Z\mid G_{i}=g\right]=(\mathbb{E}[G_{ig}])^{-1}\times\mathbb{E}[G_{ig}\times Z]\]

Average \(Z\) for \(G_{i}=g\): \(N_{g}^{-1}\sum_{i:G_{i}=g}Z_{i}\)

Average \(Z\) for all \(i\): \(N^{-1}\sum_{i}Z_{i}\)

Whats the relation?

\[\begin{align*} & \dfrac{1}{N}\sum_{i}Z_{i}\times\dfrac{1}{P(G_{i}=g)}\\ & =\dfrac{1}{P(G_{i}=g)\times N}\sum_{i}Z_{i}=\dfrac{1}{N_{g}}\sum_{i}Z_{i}\\ & \dfrac{1}{N_{g}}\sum_{i}\left(Z_{i}\times1\left(G_{i}=g\right)\right)\\ & =\dfrac{1}{N_{g}}\sum_{i:G_{i}=g}Z_{i} \end{align*}\]

So

\[\dfrac{1}{N_{g}}\sum_{i:G_{i}=g}Z_{i}=\dfrac{1}{N\times P(G_{i}=g)}\sum_{i}Z_{i}G_{ig}\]

step 2

Consider \[\mathbb{E}_{X}\left\{ \mathbb{E}\left[Y_{t}-Y_{g-1}\mid X,C_{i}=1\right]\mid G_{ig}=1\right\}\]

Due to above, \[\mathbb{E}_{X}\{\mathbb{E}\left[Y_{t}-Y_{g-1}\mid X,C_{i}=1\right]\mid G_{ig}=1\}=(\mathbb{E}[G_{ig}])^{-1}\mathbb{E}\left[G_{ig}(Y_{t}-Y_{g-1})\mid X,C_{i}=1\right]\]

step 3

Define the dummy variable \(1\left(G_{i}=g\right)=G_{i,g}\)

Define the generalized propensity score

\[p_{g,s}\left(X\right)=P\left(G_{i,g}\mid X,G_{i,g}+\left(1-W_{is}\right)\left(1-G_{ig}\right)=1\right)\]

which is the probability of being treated at \(g\) conditional on covariates \(X\) for the group treated at time \(g\) and the not yet treated group at time \(s\). It holds by definition that \[p_{g\infty}(X)=\frac{\mathbb{E}[G_{ig}\mid X]}{\mathbb{E}[G_{ig}+C_{i}\mid X]}\]

One can show that \[\dfrac{\mathbb{E}\left[G_{ig}(Y_{t}-Y_{g-1})\mid X,C_{i}=1\right]}{\mathbb{E}[G_{ig}]}=\dfrac{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}(Y_{t}-Y_{g-1})\right]}{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}\right]}\]

step 4

\[\begin{align} ATT(g,t) & =\mathbb{E}\left[Y_{it}(g)-Y_{it}(\infty)\mid G_{ig}=1\right]\nonumber\\ & =\mathbb{E}\left\{E\left[Y_{it}-Y_{i,g-1}\mid X,G_{ig}=1\right]-E\left[Y_{it}-Y_{i,g-1}\mid X,C_{i}=1\right]\mid G_{ig}=1\right\}\nonumber\\ & =\mathbb{E}\left[\dfrac{G_{ig}\left(Y_{it}-Y_{i,g-1}\right)}{\mathbb{E}\left[G_{ig}\right]}-\dfrac{\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}(Y_{t}-Y_{g-1})}{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}\right]}\right]\nonumber\\ & =\mathbb{E}\left[\left(\dfrac{G_{ig}}{\mathbb{E}\left[G_{ig}\right]}-\dfrac{\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}}{\mathbb{E}\left[\dfrac{p_{g,\infty}\left(X\right)\times C_{i}}{\left(1-p_{g,\infty}\left(X\right)\right)}\right]}\right)\left(Y_{it}-Y_{i,g-1}\right)\right] \end{align}\]

data <- make_data()

data <- data |> 
  filter(cohort %in% c(1990, 9999)) |> 
  mutate(g_i1990 = 1*(cohort == 1990),
         c_i = 1*(cohort == 9999))

xgb_mod <- xgboost::xgboost(data = data |> filter(year == 1990) |> select(X1, X2, X3) |> as.matrix(), 
                 label = data |> filter(year == 1990) |> pull(g_i1990), 
                 nrounds = 500, verbose = F)

p_hat <- xgb_mod |> predict(data |> filter(year == 1989) |> select(X1, X2, X3) |> as.matrix())

g_i1990 <- data |> filter(year == 1981) |> pull(g_i1990)
E_g_i1990 <- mean(g_i1990)

c_i <- data |> filter(year == 1981) |> pull(c_i)
ip <- (p_hat * c_i) / (1 - p_hat)
E_ip <- mean(ip)

aipw <- g_i1990/E_g_i1990 - ip / E_ip

Y_treat_post = data |> filter(year == 1991) |> pull(Y)
Y_treat_pre = data |> filter(year == 1989) |> pull(Y)

Y_star = aipw * (Y_treat_post - Y_treat_pre)

coef_aipw <- lm(Y_star ~ 1)$coefficients[1]

lets do it double robust

data <- make_data()

# conditional mean outcome for the controls
mu_control_pred <- function(data, cohort_test, year_test, cohort_train) {
  xgb_mod <- xgboost::xgboost(data = data |> filter(cohort == cohort_train) |> select(X1, X2, X3, year) |> as.matrix(), 
                 label = data |> filter(cohort == cohort_train) |> pull(Y), 
                 nrounds = 500, verbose = F)
  test_data <- data |> filter(cohort %in% c(cohort_test, cohort_train), year == year_test) |> select(X1,X2,X3,year) |> as.matrix()
  return(predict(xgb_mod, test_data))
}

# aipw 
propensity_pred <- function(data, cohort_test, cohort_train) {
  
  data <- data |> 
    filter(cohort %in% c(cohort_test, cohort_train)) |> 
    mutate(g_i1990 = 1*(cohort == cohort_test),
           c_i = 1*(cohort == cohort_train))
  
  xgb_mod <- xgboost::xgboost(data = data |> filter(cohort %in% c(cohort_test, cohort_train), year == cohort_test) |> select(X1, X2, X3) |> as.matrix(), 
                   label = data |> filter(cohort %in% c(cohort_test, cohort_train),year == cohort_test) |> pull(g_i1990), 
                   nrounds = 500, verbose = F)
  
  p_hat <- xgb_mod |> predict(data |> filter(year == cohort_test) |> select(X1, X2, X3) |> as.matrix())
  
  g_i1990 <- data |> filter(year == cohort_test) |> pull(g_i1990)
  E_g_i1990 <- mean(g_i1990)
  
  c_i <- data |> filter(year == cohort_test) |> pull(c_i)
  ip <- (p_hat * c_i) / (1 - p_hat)
  E_ip <- mean(ip)
  
  aipw <- g_i1990 / E_g_i1990 - ip / E_ip
  return(aipw)
}

Y_treat_post = data |> filter(cohort %in% c(1990, 9999), year == 1991) |> pull(Y)
Y_treat_pre = data |> filter(cohort %in% c(1990, 9999), year == 1989) |> pull(Y)

aipw <- propensity_pred(data, 1990, 9999)
mu_control <- mu_control_pred(data, 1990, 1991, 9999) - mu_control_pred(data, 1990, 1989, 9999)

Y_star = aipw * (Y_treat_post - Y_treat_pre - mu_control)

coef_dr <- lm(Y_star ~ 1)$coefficients[1]

lets compare

tibble(coef_did, coef_cs, coef_db, coef_aipw, coef_dr)
# A tibble: 1 × 5
  coef_did coef_cs coef_db coef_aipw coef_dr
     <dbl>   <dbl>   <dbl>     <dbl>   <dbl>
1     1.12    1.13   0.921     0.935    1.03