DID Machine Learning Approaches

Author

Dor Leventer

Published

March 8, 2023

This document is mainly code oriented, and its purpose is to code from scratch, how to estimate a residualized DID regression.

Setup

# general setup
rm(list = ls()) # del all objects and functions
gc() #cleans memory
options(scipen = 999) # tell R not to use Scientific notation
options(digits = 5) # controls how many digits are printed by default

# libraries
pac.vec <-
  c(
    "did",
    "broom",
    "data.table",
    "tidyverse",
    "grf",
    "patchwork"
  )
lapply(pac.vec, require, character.only = T)

# setup for ggplot graph layout
theme_set(theme_bw(base_size = 16))
theme_update(
  # change axis text size
  axis.text = element_text(size = 14),
  axis.title = element_text(size = 16),
  # change title alignment and size
  plot.title = element_text(hjust = 0.5, size = 20),
  plot.subtitle = element_text(hjust = 0.5, size = 18),
  # what lines in background of plot
  panel.grid.major.y = element_line(color = "gray"),
  panel.grid.major.x = element_line(color = "gray"),
  # legend text size and outline
  legend.text = element_text(size = 14),
  legend.background = element_rect(
    fill = "white",
    colour = "black",
    linewidth = 0.2,
    linetype = "solid"
  ),
  # facet title size
  strip.text.x = element_text(size = 16),
  strip.text.y = element_text(size = 16)
)

## create custom theme for ggplot
theme_nice <- function() {
    theme(
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5),
      plot.subtitle = element_text(hjust = 0.5),
    )
}

# setup for graph colors
orange <- "#f46d43"
lightorange <- "#fdae61"
blue <- "#0072B2"
lightblue <- "#74add1"
  
# setup for graph saving for slides
height = 4
width = 8

# parameters
set.seed(1)

the dgp

make_data <- function(nobs = 1000,
                      years = c(1, 10), 
                      time_treat = 5, 
                      rho = 0.2, 
                      seed = 1) {
  set.seed(seed)
  # create panel data
  dat <- CJ(id = 1:nobs, time = years[1]:years[2])
  # add constant Xs
  X = gendata::genmvnorm(cor = c(rho, rho, rho, rho, rho, rho),
                         k = 4,
                         n = nobs) %>% as.data.table()
  X[, id := 1:nobs]
  dat = dat[X, on = "id"]
  # treatment propensity and treatment group
  dat[, logit_error := rnorm(1), by = "id"][
    , treat_prop := (1/(1+exp(1)^(-(-0.5 + (X3 > 0) + 0.5*X3*X4 + logit_error))))][
    , treat := 1*(treat_prop > .5)][
    , cohort := fifelse(treat == 1, time_treat, 99)]
  # did variables
  dat[, time_fe := rnorm(n=1, mean = (time)/5, sd = 1), by = "time"][
      , unit_fe := rnorm(1, mean = (treat), sd = 1), by = "id"]
  # treatment effect and outcome
  dat[, tau := fifelse(time >= cohort & treat == 1, 1*(X1 > 0) + 1*(X2 > 0), 0)][
    , cumtau := cumsum(tau), by = "id"][
      , error  := rnorm(.N, 0, .5)][
        , Y0 := unit_fe + time_fe + 0.2*(X4 > 0) + 0.1*X3^2 + 0.1*X2*X1][
          , Y := Y0 + cumtau + error]
  return(dat)
}

look at the data

data = make_data(seed = 1) |> tibble()
data |> filter(time == 1) |> count(treat)
# A tibble: 2 × 2
  treat     n
  <dbl> <int>
1     0   491
2     1   509
data = make_data(seed = 1) |> tibble()

data <- data |> 
  group_by(cohort, time) |> 
  summarise(Y_mean = mean(Y), .groups = "keep") |> 
  right_join(data, by = c("cohort", "time"))

data |> 
  ggplot() + 
  geom_line(data = data |> filter(treat == 1), 
            mapping = aes(x = time, y = Y, group = id), 
            color = "#FC9272", alpha = .2, linewidth = .1) +
  geom_line(data = data |> filter(treat == 0), 
            mapping = aes(x = time, y = Y, group = id), 
            color = "#9ECAE1", alpha = .2, linewidth = .1) +
  geom_line(data = data |> filter(treat == 1), 
            mapping = aes(x = time, y = Y_mean,  color = "Treated"), 
            alpha = .8, linewidth = 2) + 
  geom_line(data = data |> filter(treat == 0), 
            mapping = aes(x = time, y = Y_mean, color = "Control"), 
            alpha = .8, linewidth = 2) + 
  theme_nice() +
  labs(x = "Time Period") +
  geom_vline(xintercept = 4.5, linetype = "dashed") +
  scale_x_continuous(breaks = seq(1, 10, 1)) + 
  scale_color_manual(name = NULL, breaks = c("Treated", "Control"), values = c("#CB181D", "#2171B5"))

ggsave(
    filename = glue::glue("grf_desc.pdf"),
    width = 8,
    height = 4
  )

GRF and DID

splitting a tree

# functions ---------------------------------------------------------------

split_function <- function(df, split_value, split_column = 1) {
  split_indexes <- df[,split_column] >= split_value
  data_1 <- df[split_indexes,]
  data_2 <- df[!split_indexes,]
  
  theta_1 <- lm(Y ~ W, data = data_1)$coefficients[2]
  theta_2 <- lm(Y ~ W, data = data_2)$coefficients[2]
  
  weight = nrow(data_1)*nrow(data_2)/(nrow(df)^2)
  
  delta_c <- weight*(theta_1 - theta_2)*2
  
  return(round(delta_c, digits = 2))
}

# graphs ------------------------------------------------------------------

data <- make_data()
# some local parameters
cohort_test = 5; cohort_train = 99; year_pre = 4; year_post = 5;

# some data configuration
data <- data |> 
  filter(time %in% c(year_pre, year_post)) |> 
  mutate(W = treat) |> 
  tibble() |> 
  mutate(W_char = ifelse(W == 1, "Treatment", "Control"),
         time_char = ifelse(time == year_post, "After", "Before"))

# some needed variables
Y_treat_post = data |> filter(time == year_post) |> pull(Y)
Y_treat_pre = data |> filter(time == year_pre) |> pull(Y)
delta_Y = Y_treat_post - Y_treat_pre
W = data |> filter(time == year_post) |> pull(W)
X <- data |> filter(time == year_post) |> select(X1:X4) |> as.matrix()

df <- data.frame(X, W, Y = delta_Y) |> 
  mutate(W_char = ifelse(W == 1, "Treatment", "Control"))

data |>
  ggplot() +
  geom_point(aes(x = X1, y = X2, color = Y), stroke = .8, alpha = .8) +
  labs(subtitle = "The covariate space") +
  theme_nice() + 
  facet_grid(cols = vars(W_char), rows = vars(time_char)) +
  scale_color_viridis_b(name = "Y")

# df |>
#   ggplot() +
#   geom_point(aes(x = X1, y = X2, color = Y), stroke = .8, alpha = .8) +
#   labs(subtitle = "The covariate space") +
#   theme_nice() + 
#   facet_grid(cols = vars(W_char), rows = vars(time_char)) +
#   scale_color_viridis_b(name = "Delta Y")

ggsave(
  filename = "causal_trees_example_1.pdf",
  width = 8, 
  height = 4
)
for (split_value in-1:1) {
  data |>
    ggplot() +
    geom_point(aes(x = X1, y = X2, color = Y), stroke = .8, alpha = .8) +
    labs(subtitle = "The covariate space") +
    theme_nice() + 
    facet_grid(cols = vars(W_char), rows = vars(time_char)) +
    scale_color_viridis_b(name = "Y") +
    geom_vline(xintercept = split_value,color = "black",linewidth = 1,linetype = "dashed") +
    labs(subtitle = glue::glue("OLS DID Delta(C_1, C_2) = {split_function(df, split_value)}")) 

  ggsave(
    filename = glue::glue("causal_trees_example_{split_value+3}.pdf"),
    width = 8,
    height = 4
  )
}
cf <- grf::causal_forest(X = X, W = W, Y = delta_Y)
temp <- cf$predictions |> cbind(X, W)
temp |> as.data.frame() |> rename(tau = V1) |> 
  filter(W == 1) |> 
  ggplot(aes(x = X1, y = X2, color = tau)) + 
  geom_point(stroke = .8, alpha = .8) +
  theme_nice() +
  labs(subtitle = "GRF CATT estimates") +
  scale_color_viridis_b(name = "Tau hat")

ggsave(
    filename = glue::glue("het_grf.pdf"),
    width = 8,
    height = 4
  )