pacman::p_load(sf, tidyverse, hrbrthemes, lubridate, DBI, dbplyr, bigrquery, here, estimatr, lubridate, MonteCarlo)
pacman::p_install(rms, force=F)
pacman::p_load(ggplot2)
theme_set(theme_ipsum())
pacman::p_load(rddtools)
The average treatment effect compares outcomes with and without treatment for the whole population. The local average treatment effect also compares outcomes with and without treatment but is conditional on some characteristic x. The average treatment on the treated compares outcome with treatment and without treatment, exclusively for the treated group.
Policy can potentially care about all and any of these three measures. Naturally, policymakers care about the average treatment effect on the treated group. They might, however, also care about the other two measures. If a policy is targeted towards a certain group, the ATE might still be relevant if we expect spillovers/externalities on those who are not treated. The LATE might also be relevant in cases where the policymaker cares about how the treatment might interact with other characteristics.
Generating data
ind =
data.frame(
i = 1:10000,
ei = rnorm(length(1:10000), mean = 0, sd=1),
Y_pre = rnorm(length(1:10000), mean = 3, sd = 2),
Y_treated = rnorm(length(1:10000), mean = 5, sd = 20),
Y_untreated = rnorm(length(1:10000), mean = 3, sd = 2),
X = rep(0:1, each = 10000/2)
)
ind <- ind%>% mutate(Yobs = ifelse( X == 1, Y_treated, Y_untreated))
Let’s measure the ATE
ind$ATE <- ind$Y_treated- ind$Y_untreated
mean(ind$ATE)
## [1] 1.829711
Let’s measure the ATT
ATT <- ind %>% group_by(X) %>% summarise (ATT = mean(ATE))
ATT[2,2]
## # A tibble: 1 x 1
## ATT
## <dbl>
## 1 1.59
Let’s look at the observed sample mean differences
obs <- ind %>% group_by(X) %>% summarise (diff_treated = mean(Y_treated-Y_pre), diff_untreated=mean(Y_untreated-Y_pre))
observed_difference <- obs[2,2]-obs[1,3]
observed_difference
## diff_treated
## 1 1.599341
Comparison:
The observed sample mean difference is close to the average treatment on the treated. This is happening in this simulation because I have designed the average treatment on the untreated to be 0.
raw <- read.csv(file="bac.csv", header=TRUE, sep=",") %>% arrange(bac)
head(raw)
## Date Alcohol1 Alcohol2 bac male white recidivism acc aged year
## 1 23 Jun 01 0 0 0 1 1 0 0 25 2001
## 2 11 Oct 99 0 0 0 1 0 1 0 41 1999
## 3 12 Oct 99 0 0 0 1 0 1 0 41 1999
## 4 19 Oct 99 0 0 0 1 0 1 0 41 1999
## 5 04 Oct 07 0 0 0 0 1 0 1 44 2007
## 6 10 Aug 00 0 0 0 1 1 1 0 21 2000
raw$bac <- as.numeric(raw$bac)
disc <- qplot(raw$bac,
geom = "histogram",
binwidth = 0.001,
main = "BAC distribution",
xlab = "BAC",
fill = I("black"),
col = I("grey"),
alpha = I(.2),
xlim = c(0, 0.440)
)
disc + geom_vline (xintercept = 0.08 , color = "blue", size = 0.3) + geom_vline (xintercept = 0.15, color = "blue", size = 0.3)
## Warning: Removed 2 rows containing missing values (geom_bar).
There appears to be sorting at the 0.08 threshold as we can see a significant spike on the right of the threshold.
raw$threshold <- as.numeric (raw$bac >=0.08 )
raw$resc_bac <- raw$bac-0.08
str(raw)
## 'data.frame': 214558 obs. of 12 variables:
## $ Date : Factor w/ 3286 levels "01 Apr 00","01 Apr 01",..: 2431 1178 1286 2042 421 981 2062 793 3167 1561 ...
## $ Alcohol1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Alcohol2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bac : num 0 0 0 0 0 0 0 0 0 0 ...
## $ male : int 1 1 1 1 0 1 1 0 1 1 ...
## $ white : int 1 0 0 0 1 1 1 1 1 1 ...
## $ recidivism: int 0 1 1 1 0 1 0 0 0 0 ...
## $ acc : int 0 0 0 0 1 0 0 1 0 1 ...
## $ aged : int 25 41 41 41 44 21 38 48 43 50 ...
## $ year : int 2001 1999 1999 1999 2007 2000 2001 2001 2006 2004 ...
## $ threshold : num 0 0 0 0 0 0 0 0 0 0 ...
## $ resc_bac : num -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 ...
raw$acc <- as.numeric(raw$acc)
raw$aged <- as.numeric(raw$aged)
raw$male <- as.numeric(raw$male)
raw$white <- as.numeric(raw$white)
rd_model_age <- lm(aged ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_age)
##
## Call:
## lm(formula = aged ~ threshold + resc_bac + I(threshold * resc_bac),
## data = raw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.230 -9.710 -2.315 7.602 47.005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.0375 0.1026 331.651 <2e-16 ***
## threshold -1.0702 0.1133 -9.448 <2e-16 ***
## resc_bac -56.0191 2.9773 -18.815 <2e-16 ***
## I(threshold * resc_bac) 83.5301 3.0316 27.553 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.43 on 214554 degrees of freedom
## Multiple R-squared: 0.01249, Adjusted R-squared: 0.01248
## F-statistic: 904.6 on 3 and 214554 DF, p-value: < 2.2e-16
rd_model_acc <- lm(acc ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_acc)
##
## Call:
## lm(formula = acc ~ threshold + resc_bac + I(threshold * resc_bac),
## data = raw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46549 -0.16725 -0.12433 -0.08818 0.92651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.076067 0.003148 24.166 <2e-16 ***
## threshold -0.003707 0.003474 -1.067 0.286
## resc_bac -1.544779 0.091313 -16.917 <2e-16 ***
## I(threshold * resc_bac) 2.674453 0.092980 28.764 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3506 on 214554 degrees of freedom
## Multiple R-squared: 0.02142, Adjusted R-squared: 0.02141
## F-statistic: 1566 on 3 and 214554 DF, p-value: < 2.2e-16
rd_model_male <- lm(male ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_male)
##
## Call:
## lm(formula = male ~ threshold + resc_bac + I(threshold * resc_bac),
## data = raw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7970 0.2039 0.2084 0.2121 0.2392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.791382 0.003660 216.236 < 2e-16 ***
## threshold 0.005725 0.004040 1.417 0.15639
## resc_bac 0.209732 0.106171 1.975 0.04822 *
## I(threshold * resc_bac) -0.311869 0.108109 -2.885 0.00392 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4076 on 214554 degrees of freedom
## Multiple R-squared: 0.0001435, Adjusted R-squared: 0.0001295
## F-statistic: 10.26 on 3 and 214554 DF, p-value: 9.408e-07
rd_model_white <- lm(white ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_white)
##
## Call:
## lm(formula = white ~ threshold + resc_bac + I(threshold * resc_bac),
## data = raw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9038 0.1277 0.1366 0.1434 0.1655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.849351 0.003099 274.051 <2e-16 ***
## threshold 0.002149 0.003421 0.628 0.5299
## resc_bac 0.185013 0.089909 2.058 0.0396 *
## I(threshold * resc_bac) -0.010138 0.091550 -0.111 0.9118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3452 on 214554 degrees of freedom
## Multiple R-squared: 0.0008054, Adjusted R-squared: 0.0007914
## F-statistic: 57.64 on 3 and 214554 DF, p-value: < 2.2e-16
The only variables for which the threshold appears to be significant is age. Therefore, I run a more miticulous RD regression with bandwidth specification below.
frame <- rdd_data(y=raw$age, x=raw$bac, cutpoint = 0.08)
reg_para <- rdd_reg_lm(rdd_object = frame, bw= 0.05)
reg_para
## ### RDD regression: parametric ###
## Polynomial order: 1
## Slopes: separate
## Bandwidth: 0.05
## Number of obs: 93792 (left: 20570, right: 73222)
##
## Coefficient:
## Estimate Std. Error t value Pr(>|t|)
## D -0.17462 0.15657 -1.1153 0.2647
The results of this regression show that there is in fact no sorting at the threshold.
raw.05 <- raw[ which(raw$resc_bac < 0.05 & raw$resc_bac > -0.05), ]
raw.05$recidivism <- as.numeric(raw.05$recidivism)
rd_model_recid <- lm(recidivism ~ threshold + resc_bac + I(threshold * resc_bac), data = raw.05)
summary(rd_model_recid)
##
## Call:
## lm(formula = recidivism ~ threshold + resc_bac + I(threshold *
## resc_bac), data = raw.05)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12515 -0.11280 -0.10636 -0.09907 0.90608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.112344 0.003385 33.191 < 2e-16 ***
## threshold -0.018854 0.004208 -4.480 7.46e-06 ***
## resc_bac -0.261291 0.177207 -1.474 0.140353
## I(threshold * resc_bac) 0.690429 0.195995 3.523 0.000427 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3095 on 92154 degrees of freedom
## Multiple R-squared: 0.000541, Adjusted R-squared: 0.0005085
## F-statistic: 16.63 on 3 and 92154 DF, p-value: 8.515e-11
graph <- raw %>% group_by(resc_bac) %>% summarise (avrecid = mean(recidivism))
graph3 <- ggplot(graph, aes(x = resc_bac, y =avrecid)) + geom_point() + theme(axis.text.x= element_text(size=7, angle=90)) + scale_x_continuous(name ="BAC (centered around 0.08", limits=c(-0.05,0.13)) + scale_y_continuous(name="Average recidivism", limits=c(0,1))+ geom_vline (xintercept = 0 , color = "blue", size = 0.3)
graph3
## Warning: Removed 217 rows containing missing values (geom_point).
The coefficient is highly significant, and close to the estimate in the paper (-0.021). I failed to reproduce the graph from the paper and settled for this one, because trying to replicate the paper’s graph kept coming out funky.
frame <- rdd_data(y=raw$recidivism, x=raw$bac, cutpoint = 0.08)
reg_para <- rdd_reg_lm(rdd_object = frame, bw= 0.05)
reg_para
## ### RDD regression: parametric ###
## Polynomial order: 1
## Slopes: separate
## Bandwidth: 0.05
## Number of obs: 93792 (left: 20570, right: 73222)
##
## Coefficient:
## Estimate Std. Error t value Pr(>|t|)
## D -0.0187432 0.0041926 -4.4705 7.811e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sensitivity_table <- plotSensi(reg_para, from = 0.01, to= 0.07, by = 0.005, output = c("data"), plot = FALSE, order = 1)
sensitivity_table
## bw order LATE se CI_low CI_high
## 1 0.010 1 -0.012179907 0.008630661 -0.02909569 0.004735878
## 2 0.015 1 -0.008362047 0.006953998 -0.02199163 0.005267539
## 3 0.020 1 -0.014724315 0.006091921 -0.02666426 -0.002784369
## 4 0.025 1 -0.015940325 0.005469256 -0.02665987 -0.005220780
## 5 0.030 1 -0.016202432 0.005066444 -0.02613248 -0.006272384
## 6 0.035 1 -0.016248696 0.004771634 -0.02560093 -0.006896466
## 7 0.040 1 -0.014748742 0.004546468 -0.02365966 -0.005837827
## 8 0.045 1 -0.016481872 0.004338799 -0.02498576 -0.007977981
## 9 0.050 1 -0.018743191 0.004192594 -0.02696052 -0.010525857
## 10 0.055 1 -0.018370352 0.004036254 -0.02628126 -0.010459439
## 11 0.060 1 -0.018321070 0.003911378 -0.02598723 -0.010654909
## 12 0.065 1 -0.019345671 0.003814855 -0.02682265 -0.011868693
## 13 0.070 1 -0.019281679 0.003745128 -0.02662199 -0.011941363
sensitivity_graph <- plotSensi(reg_para, from = 0.01, to= 0.07, by = 0.005, output = c("ggplot"), plot = TRUE, order = 1)
The confidence interval above indicates that the direction of the relation is not sensitive to the bandwidth choice (unless the bandwidth choice is extremely small or less than 0.01).
set.seed(42)
n <- 92158
nsims <- 1000
coeff_sims <- c()
t_sims <- c()
for(i in 1:nsims){
resc_bac_sim <- sample(x = raw.05$resc_bac, size = n)
frame2 <- rdd_data(y=raw.05$recidivism, x=resc_bac_sim, cutpoint = 0)
model2 <- rdd_reg_lm(rdd_object = frame2)
coeff_sims[i] <- summary(model2)$coefficients[2,1]
t_sims[i] <- summary(model2)$coefficients[2,4]
}
summary(coeff_sims)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.347e-02 -3.170e-03 -3.116e-04 -3.957e-05 3.082e-03 1.452e-02
summary(t_sims)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0005636 0.2120993 0.4559634 0.4740536 0.7465244 0.9973392
ri_p_value_raw <- mean( abs(coeff_sims) >= abs(summary(reg_para)$coefficients[2, 1]))
ri_p_value_raw
## [1] 0
There’s no evidence of experimental design bias. As per the p-value above, the estimated placebo never exceeds the estimated treatment effect.
THE END