Regression discontinuity design

pacman::p_load(sf, tidyverse, hrbrthemes, lubridate, DBI, dbplyr, bigrquery, here, estimatr, lubridate, MonteCarlo)
pacman::p_install(rms, force=F)
pacman::p_load(ggplot2)
theme_set(theme_ipsum())
pacman::p_load(rddtools)

ATE, ATT, LATE

a) Discuss why the ATE, ATT and LATE can each be important from a policy or economics perspective, and also how they are distinclty different.

The average treatment effect compares outcomes with and without treatment for the whole population. The local average treatment effect also compares outcomes with and without treatment but is conditional on some characteristic x. The average treatment on the treated compares outcome with treatment and without treatment, exclusively for the treated group.

Policy can potentially care about all and any of these three measures. Naturally, policymakers care about the average treatment effect on the treated group. They might, however, also care about the other two measures. If a policy is targeted towards a certain group, the ATE might still be relevant if we expect spillovers/externalities on those who are not treated. The LATE might also be relevant in cases where the policymaker cares about how the treatment might interact with other characteristics.

b) Simulating treatment :

Generating data

ind = 
    data.frame( 
    i = 1:10000, 
    ei = rnorm(length(1:10000), mean = 0, sd=1),
    Y_pre = rnorm(length(1:10000), mean = 3, sd = 2),
    Y_treated = rnorm(length(1:10000), mean = 5, sd = 20),
    Y_untreated = rnorm(length(1:10000), mean = 3, sd = 2),
    X = rep(0:1, each = 10000/2)
    )
ind <- ind%>% mutate(Yobs = ifelse( X == 1, Y_treated, Y_untreated))

Let’s measure the ATE

ind$ATE <- ind$Y_treated- ind$Y_untreated  
mean(ind$ATE)

## [1] 1.829711

Let’s measure the ATT

ATT <- ind %>% group_by(X) %>% summarise (ATT = mean(ATE))
ATT[2,2]

## # A tibble: 1 x 1
##     ATT
##   <dbl>
## 1  1.59

Let’s look at the observed sample mean differences

obs <- ind %>% group_by(X) %>% summarise (diff_treated = mean(Y_treated-Y_pre), diff_untreated=mean(Y_untreated-Y_pre))
observed_difference <- obs[2,2]-obs[1,3]
observed_difference

##   diff_treated
## 1     1.599341

Comparison:

The observed sample mean difference is close to the average treatment on the treated. This is happening in this simulation because I have designed the average treatment on the untreated to be 0.

Replication exercise

Reading the file

raw <- read.csv(file="bac.csv", header=TRUE, sep=",") %>% arrange(bac)
head(raw)

##        Date Alcohol1 Alcohol2 bac male white recidivism acc aged year
## 1 23 Jun 01        0        0   0    1     1          0   0   25 2001
## 2 11 Oct 99        0        0   0    1     0          1   0   41 1999
## 3 12 Oct 99        0        0   0    1     0          1   0   41 1999
## 4 19 Oct 99        0        0   0    1     0          1   0   41 1999
## 5 04 Oct 07        0        0   0    0     1          0   1   44 2007
## 6 10 Aug 00        0        0   0    1     1          1   0   21 2000

raw$bac <- as.numeric(raw$bac)

a) Histogram

disc <- qplot(raw$bac, 
      geom = "histogram", 
      binwidth = 0.001, 
      main = "BAC distribution", 
      xlab = "BAC", 
      fill = I("black"),
      col = I("grey"), 
      alpha = I(.2),
      xlim = c(0, 0.440)
)
disc + geom_vline (xintercept = 0.08 , color = "blue", size = 0.3) + geom_vline (xintercept = 0.15, color = "blue", size = 0.3)

## Warning: Removed 2 rows containing missing values (geom_bar).

There appears to be sorting at the 0.08 threshold as we can see a significant spike on the right of the threshold.

b) Checking sorting for other variables

raw$threshold <-  as.numeric (raw$bac >=0.08 )
raw$resc_bac <- raw$bac-0.08
str(raw)

## 'data.frame':    214558 obs. of  12 variables:
##  $ Date      : Factor w/ 3286 levels "01 Apr 00","01 Apr 01",..: 2431 1178 1286 2042 421 981 2062 793 3167 1561 ...
##  $ Alcohol1  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Alcohol2  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bac       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ male      : int  1 1 1 1 0 1 1 0 1 1 ...
##  $ white     : int  1 0 0 0 1 1 1 1 1 1 ...
##  $ recidivism: int  0 1 1 1 0 1 0 0 0 0 ...
##  $ acc       : int  0 0 0 0 1 0 0 1 0 1 ...
##  $ aged      : int  25 41 41 41 44 21 38 48 43 50 ...
##  $ year      : int  2001 1999 1999 1999 2007 2000 2001 2001 2006 2004 ...
##  $ threshold : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ resc_bac  : num  -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 -0.08 ...

raw$acc <- as.numeric(raw$acc)
raw$aged <- as.numeric(raw$aged)
raw$male <- as.numeric(raw$male)
raw$white <- as.numeric(raw$white)

rd_model_age <- lm(aged ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_age)

## 
## Call:
## lm(formula = aged ~ threshold + resc_bac + I(threshold * resc_bac), 
##     data = raw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.230  -9.710  -2.315   7.602  47.005 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              34.0375     0.1026 331.651   <2e-16 ***
## threshold                -1.0702     0.1133  -9.448   <2e-16 ***
## resc_bac                -56.0191     2.9773 -18.815   <2e-16 ***
## I(threshold * resc_bac)  83.5301     3.0316  27.553   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.43 on 214554 degrees of freedom
## Multiple R-squared:  0.01249,    Adjusted R-squared:  0.01248 
## F-statistic: 904.6 on 3 and 214554 DF,  p-value: < 2.2e-16

rd_model_acc <- lm(acc ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_acc)

## 
## Call:
## lm(formula = acc ~ threshold + resc_bac + I(threshold * resc_bac), 
##     data = raw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46549 -0.16725 -0.12433 -0.08818  0.92651 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.076067   0.003148  24.166   <2e-16 ***
## threshold               -0.003707   0.003474  -1.067    0.286    
## resc_bac                -1.544779   0.091313 -16.917   <2e-16 ***
## I(threshold * resc_bac)  2.674453   0.092980  28.764   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3506 on 214554 degrees of freedom
## Multiple R-squared:  0.02142,    Adjusted R-squared:  0.02141 
## F-statistic:  1566 on 3 and 214554 DF,  p-value: < 2.2e-16

rd_model_male <- lm(male ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_male)

## 
## Call:
## lm(formula = male ~ threshold + resc_bac + I(threshold * resc_bac), 
##     data = raw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7970  0.2039  0.2084  0.2121  0.2392 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.791382   0.003660 216.236  < 2e-16 ***
## threshold                0.005725   0.004040   1.417  0.15639    
## resc_bac                 0.209732   0.106171   1.975  0.04822 *  
## I(threshold * resc_bac) -0.311869   0.108109  -2.885  0.00392 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4076 on 214554 degrees of freedom
## Multiple R-squared:  0.0001435,  Adjusted R-squared:  0.0001295 
## F-statistic: 10.26 on 3 and 214554 DF,  p-value: 9.408e-07

rd_model_white <- lm(white ~ threshold + resc_bac + I(threshold * resc_bac), data = raw)
summary(rd_model_white)

## 
## Call:
## lm(formula = white ~ threshold + resc_bac + I(threshold * resc_bac), 
##     data = raw)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9038  0.1277  0.1366  0.1434  0.1655 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.849351   0.003099 274.051   <2e-16 ***
## threshold                0.002149   0.003421   0.628   0.5299    
## resc_bac                 0.185013   0.089909   2.058   0.0396 *  
## I(threshold * resc_bac) -0.010138   0.091550  -0.111   0.9118    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3452 on 214554 degrees of freedom
## Multiple R-squared:  0.0008054,  Adjusted R-squared:  0.0007914 
## F-statistic: 57.64 on 3 and 214554 DF,  p-value: < 2.2e-16

The only variables for which the threshold appears to be significant is age. Therefore, I run a more miticulous RD regression with bandwidth specification below.

frame <- rdd_data(y=raw$age, x=raw$bac, cutpoint = 0.08)
reg_para <- rdd_reg_lm(rdd_object = frame, bw= 0.05)
reg_para

## ### RDD regression: parametric ###
##  Polynomial order:  1 
##  Slopes:  separate 
##  Bandwidth:  0.05 
##  Number of obs: 93792 (left: 20570, right: 73222)
## 
##  Coefficient:
##   Estimate Std. Error t value Pr(>|t|)
## D -0.17462    0.15657 -1.1153   0.2647

The results of this regression show that there is in fact no sorting at the threshold.

c) Running the RD

raw.05 <- raw[ which(raw$resc_bac < 0.05 & raw$resc_bac > -0.05), ]
raw.05$recidivism <- as.numeric(raw.05$recidivism)
rd_model_recid <- lm(recidivism ~ threshold + resc_bac + I(threshold * resc_bac), data = raw.05)
summary(rd_model_recid)

## 
## Call:
## lm(formula = recidivism ~ threshold + resc_bac + I(threshold * 
##     resc_bac), data = raw.05)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12515 -0.11280 -0.10636 -0.09907  0.90608 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.112344   0.003385  33.191  < 2e-16 ***
## threshold               -0.018854   0.004208  -4.480 7.46e-06 ***
## resc_bac                -0.261291   0.177207  -1.474 0.140353    
## I(threshold * resc_bac)  0.690429   0.195995   3.523 0.000427 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3095 on 92154 degrees of freedom
## Multiple R-squared:  0.000541,   Adjusted R-squared:  0.0005085 
## F-statistic: 16.63 on 3 and 92154 DF,  p-value: 8.515e-11

graph <- raw %>% group_by(resc_bac) %>% summarise (avrecid = mean(recidivism)) 

graph3 <- ggplot(graph, aes(x = resc_bac, y =avrecid)) + geom_point() + theme(axis.text.x= element_text(size=7, angle=90)) + scale_x_continuous(name ="BAC (centered around 0.08", limits=c(-0.05,0.13)) + scale_y_continuous(name="Average recidivism", limits=c(0,1))+ geom_vline (xintercept = 0 , color = "blue", size = 0.3)

graph3

## Warning: Removed 217 rows containing missing values (geom_point).

The coefficient is highly significant, and close to the estimate in the paper (-0.021). I failed to reproduce the graph from the paper and settled for this one, because trying to replicate the paper’s graph kept coming out funky.

d) Testing sensitivity to bandwidth

frame <- rdd_data(y=raw$recidivism, x=raw$bac, cutpoint = 0.08)
reg_para <- rdd_reg_lm(rdd_object = frame, bw= 0.05)
reg_para

## ### RDD regression: parametric ###
##  Polynomial order:  1 
##  Slopes:  separate 
##  Bandwidth:  0.05 
##  Number of obs: 93792 (left: 20570, right: 73222)
## 
##  Coefficient:
##     Estimate Std. Error t value  Pr(>|t|)    
## D -0.0187432  0.0041926 -4.4705 7.811e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sensitivity_table <- plotSensi(reg_para, from = 0.01, to= 0.07, by = 0.005, output = c("data"), plot = FALSE, order = 1)
sensitivity_table

##       bw order         LATE          se      CI_low      CI_high
## 1  0.010     1 -0.012179907 0.008630661 -0.02909569  0.004735878
## 2  0.015     1 -0.008362047 0.006953998 -0.02199163  0.005267539
## 3  0.020     1 -0.014724315 0.006091921 -0.02666426 -0.002784369
## 4  0.025     1 -0.015940325 0.005469256 -0.02665987 -0.005220780
## 5  0.030     1 -0.016202432 0.005066444 -0.02613248 -0.006272384
## 6  0.035     1 -0.016248696 0.004771634 -0.02560093 -0.006896466
## 7  0.040     1 -0.014748742 0.004546468 -0.02365966 -0.005837827
## 8  0.045     1 -0.016481872 0.004338799 -0.02498576 -0.007977981
## 9  0.050     1 -0.018743191 0.004192594 -0.02696052 -0.010525857
## 10 0.055     1 -0.018370352 0.004036254 -0.02628126 -0.010459439
## 11 0.060     1 -0.018321070 0.003911378 -0.02598723 -0.010654909
## 12 0.065     1 -0.019345671 0.003814855 -0.02682265 -0.011868693
## 13 0.070     1 -0.019281679 0.003745128 -0.02662199 -0.011941363

sensitivity_graph <- plotSensi(reg_para, from = 0.01, to= 0.07, by = 0.005, output = c("ggplot"), plot = TRUE, order = 1)

The confidence interval above indicates that the direction of the relation is not sensitive to the bandwidth choice (unless the bandwidth choice is extremely small or less than 0.01).

e) Testing the design: randomization inference

set.seed(42)
n <- 92158
nsims <- 1000

coeff_sims <- c()
t_sims <- c()

for(i in 1:nsims){
  resc_bac_sim <- sample(x = raw.05$resc_bac, size = n)
  frame2 <- rdd_data(y=raw.05$recidivism, x=resc_bac_sim, cutpoint = 0)
  model2 <- rdd_reg_lm(rdd_object = frame2)
  coeff_sims[i] <- summary(model2)$coefficients[2,1]
  t_sims[i] <- summary(model2)$coefficients[2,4]
}

summary(coeff_sims)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -1.347e-02 -3.170e-03 -3.116e-04 -3.957e-05  3.082e-03  1.452e-02

summary(t_sims)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0005636 0.2120993 0.4559634 0.4740536 0.7465244 0.9973392

ri_p_value_raw <- mean( abs(coeff_sims) >= abs(summary(reg_para)$coefficients[2, 1]))
ri_p_value_raw

## [1] 0

There’s no evidence of experimental design bias. As per the p-value above, the estimated placebo never exceeds the estimated treatment effect.

THE END