# general setup
rm(list = ls()) # del all objects and functions
gc() #cleans memory
options(scipen = 999) # tell R not to use Scientific notation
options(digits = 5) # controls how many digits are printed by default
# libraries
pac.vec <-
c(
"did",
"broom",
"data.table",
"tidyverse",
"grf",
"patchwork"
)
lapply(pac.vec, require, character.only = T)
# setup for ggplot graph layout
theme_set(theme_bw(base_size = 16))
theme_update(
# change axis text size
axis.text = element_text(size = 14),
axis.title = element_text(size = 16),
# change title alignment and size
plot.title = element_text(hjust = 0.5, size = 20),
plot.subtitle = element_text(hjust = 0.5, size = 18),
# what lines in background of plot
panel.grid.major.y = element_line(color = "gray"),
panel.grid.major.x = element_line(color = "gray"),
# legend text size and outline
legend.text = element_text(size = 14),
legend.background = element_rect(
fill = "white",
colour = "black",
linewidth = 0.2,
linetype = "solid"
),
# facet title size
strip.text.x = element_text(size = 16),
strip.text.y = element_text(size = 16)
)
## create custom theme for ggplot
theme_nice <- function() {
theme(
panel.grid.minor = element_blank(),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
)
}
# setup for graph colors
orange <- "#f46d43"
lightorange <- "#fdae61"
blue <- "#0072B2"
lightblue <- "#74add1"
# setup for graph saving for slides
height = 4
width = 8
# parameters
set.seed(1)DID Machine Learning Approaches
This document is mainly code oriented, and its purpose is to code from scratch, how to estimate a residualized DID regression.
Setup
the dgp
make_data <- function(nobs = 1000,
years = c(1, 10),
time_treat = 5,
rho = 0.2,
seed = 1) {
set.seed(seed)
# create panel data
dat <- CJ(id = 1:nobs, time = years[1]:years[2])
# add constant Xs
X = gendata::genmvnorm(cor = c(rho, rho, rho, rho, rho, rho),
k = 4,
n = nobs) %>% as.data.table()
X[, id := 1:nobs]
dat = dat[X, on = "id"]
# treatment propensity and treatment group
dat[, logit_error := rnorm(1), by = "id"][
, treat_prop := (1/(1+exp(1)^(-(-0.5 + (X3 > 0) + 0.5*X3*X4 + logit_error))))][
, treat := 1*(treat_prop > .5)][
, cohort := fifelse(treat == 1, time_treat, 99)]
# did variables
dat[, time_fe := rnorm(n=1, mean = (time)/5, sd = 1), by = "time"][
, unit_fe := rnorm(1, mean = (treat), sd = 1), by = "id"]
# treatment effect and outcome
dat[, tau := fifelse(time >= cohort & treat == 1, 1*(X1 > 0) + 1*(X2 > 0), 0)][
, cumtau := cumsum(tau), by = "id"][
, error := rnorm(.N, 0, .5)][
, Y0 := unit_fe + time_fe + 0.2*(X4 > 0) + 0.1*X3^2 + 0.1*X2*X1][
, Y := Y0 + cumtau + error]
return(dat)
}look at the data
data = make_data(seed = 1) |> tibble()
data |> filter(time == 1) |> count(treat)# A tibble: 2 × 2
treat n
<dbl> <int>
1 0 491
2 1 509
data = make_data(seed = 1) |> tibble()
data <- data |>
group_by(cohort, time) |>
summarise(Y_mean = mean(Y), .groups = "keep") |>
right_join(data, by = c("cohort", "time"))
data |>
ggplot() +
geom_line(data = data |> filter(treat == 1),
mapping = aes(x = time, y = Y, group = id),
color = "#FC9272", alpha = .2, linewidth = .1) +
geom_line(data = data |> filter(treat == 0),
mapping = aes(x = time, y = Y, group = id),
color = "#9ECAE1", alpha = .2, linewidth = .1) +
geom_line(data = data |> filter(treat == 1),
mapping = aes(x = time, y = Y_mean, color = "Treated"),
alpha = .8, linewidth = 2) +
geom_line(data = data |> filter(treat == 0),
mapping = aes(x = time, y = Y_mean, color = "Control"),
alpha = .8, linewidth = 2) +
theme_nice() +
labs(x = "Time Period") +
geom_vline(xintercept = 4.5, linetype = "dashed") +
scale_x_continuous(breaks = seq(1, 10, 1)) +
scale_color_manual(name = NULL, breaks = c("Treated", "Control"), values = c("#CB181D", "#2171B5"))ggsave(
filename = glue::glue("grf_desc.pdf"),
width = 8,
height = 4
)GRF and DID
splitting a tree
# functions ---------------------------------------------------------------
split_function <- function(df, split_value, split_column = 1) {
split_indexes <- df[,split_column] >= split_value
data_1 <- df[split_indexes,]
data_2 <- df[!split_indexes,]
theta_1 <- lm(Y ~ W, data = data_1)$coefficients[2]
theta_2 <- lm(Y ~ W, data = data_2)$coefficients[2]
weight = nrow(data_1)*nrow(data_2)/(nrow(df)^2)
delta_c <- weight*(theta_1 - theta_2)*2
return(round(delta_c, digits = 2))
}
# graphs ------------------------------------------------------------------
data <- make_data()
# some local parameters
cohort_test = 5; cohort_train = 99; year_pre = 4; year_post = 5;
# some data configuration
data <- data |>
filter(time %in% c(year_pre, year_post)) |>
mutate(W = treat) |>
tibble() |>
mutate(W_char = ifelse(W == 1, "Treatment", "Control"),
time_char = ifelse(time == year_post, "After", "Before"))
# some needed variables
Y_treat_post = data |> filter(time == year_post) |> pull(Y)
Y_treat_pre = data |> filter(time == year_pre) |> pull(Y)
delta_Y = Y_treat_post - Y_treat_pre
W = data |> filter(time == year_post) |> pull(W)
X <- data |> filter(time == year_post) |> select(X1:X4) |> as.matrix()
df <- data.frame(X, W, Y = delta_Y) |>
mutate(W_char = ifelse(W == 1, "Treatment", "Control"))
data |>
ggplot() +
geom_point(aes(x = X1, y = X2, color = Y), stroke = .8, alpha = .8) +
labs(subtitle = "The covariate space") +
theme_nice() +
facet_grid(cols = vars(W_char), rows = vars(time_char)) +
scale_color_viridis_b(name = "Y")# df |>
# ggplot() +
# geom_point(aes(x = X1, y = X2, color = Y), stroke = .8, alpha = .8) +
# labs(subtitle = "The covariate space") +
# theme_nice() +
# facet_grid(cols = vars(W_char), rows = vars(time_char)) +
# scale_color_viridis_b(name = "Delta Y")
ggsave(
filename = "causal_trees_example_1.pdf",
width = 8,
height = 4
)for (split_value in-1:1) {
data |>
ggplot() +
geom_point(aes(x = X1, y = X2, color = Y), stroke = .8, alpha = .8) +
labs(subtitle = "The covariate space") +
theme_nice() +
facet_grid(cols = vars(W_char), rows = vars(time_char)) +
scale_color_viridis_b(name = "Y") +
geom_vline(xintercept = split_value,color = "black",linewidth = 1,linetype = "dashed") +
labs(subtitle = glue::glue("OLS DID Delta(C_1, C_2) = {split_function(df, split_value)}"))
ggsave(
filename = glue::glue("causal_trees_example_{split_value+3}.pdf"),
width = 8,
height = 4
)
}cf <- grf::causal_forest(X = X, W = W, Y = delta_Y)
temp <- cf$predictions |> cbind(X, W)
temp |> as.data.frame() |> rename(tau = V1) |>
filter(W == 1) |>
ggplot(aes(x = X1, y = X2, color = tau)) +
geom_point(stroke = .8, alpha = .8) +
theme_nice() +
labs(subtitle = "GRF CATT estimates") +
scale_color_viridis_b(name = "Tau hat")ggsave(
filename = glue::glue("het_grf.pdf"),
width = 8,
height = 4
)