a. Download and read the paper: Carpenter and Dobkin(2009) The Effect of Alcohol Consumption on Mortality: Regression Discontinuity Evidence from the Minimum Drinking Age AEJ: Applied Economics 1(1)164-82

b. Get the Carpenter & Dobkin and data:

# Load STATA file using the foreign package
library(foreign)

# Import dta data
my_data <- read.dta("AEJfigs.dta")
summary(my_data)
##     agecell           all           allfitted         internal    
##  Min.   :19.07   Min.   : 88.43   Min.   : 91.71   Min.   :15.98  
##  1st Qu.:20.08   1st Qu.: 92.79   1st Qu.: 93.04   1st Qu.:18.60  
##  Median :21.00   Median : 95.69   Median : 95.18   Median :20.29  
##  Mean   :21.00   Mean   : 95.67   Mean   : 95.80   Mean   :20.29  
##  3rd Qu.:21.92   3rd Qu.: 98.03   3rd Qu.: 97.79   3rd Qu.:21.98  
##  Max.   :22.93   Max.   :105.27   Max.   :102.89   Max.   :24.37  
##                  NA's   :2                         NA's   :2      
##  internalfitted     external     externalfitted     alcohol      
##  Min.   :16.74   Min.   :71.34   Min.   :73.16   Min.   :0.6391  
##  1st Qu.:18.67   1st Qu.:73.04   1st Qu.:74.06   1st Qu.:0.9962  
##  Median :20.54   Median :74.81   Median :74.74   Median :1.2119  
##  Mean   :20.28   Mean   :75.39   Mean   :75.52   Mean   :1.2573  
##  3rd Qu.:21.66   3rd Qu.:77.24   3rd Qu.:76.06   3rd Qu.:1.4701  
##  Max.   :24.04   Max.   :83.33   Max.   :81.78   Max.   :2.5193  
##                  NA's   :2                       NA's   :2       
##  alcoholfitted       homicide     homicidefitted     suicide     
##  Min.   :0.7943   Min.   :14.95   Min.   :16.26   Min.   :10.89  
##  1st Qu.:1.0724   1st Qu.:16.61   1st Qu.:16.54   1st Qu.:11.61  
##  Median :1.2471   Median :16.99   Median :16.99   Median :12.20  
##  Mean   :1.2674   Mean   :16.91   Mean   :16.95   Mean   :12.35  
##  3rd Qu.:1.4455   3rd Qu.:17.29   3rd Qu.:17.25   3rd Qu.:12.82  
##  Max.   :1.8174   Max.   :18.41   Max.   :17.76   Max.   :14.83  
##                   NA's   :2                       NA's   :2      
##  suicidefitted        mva          mvafitted         drugs      
##  Min.   :11.59   Min.   :26.86   Min.   :27.87   Min.   :3.202  
##  1st Qu.:11.61   1st Qu.:30.12   1st Qu.:30.17   1st Qu.:3.755  
##  Median :12.25   Median :31.64   Median :31.73   Median :4.314  
##  Mean   :12.36   Mean   :31.62   Mean   :31.68   Mean   :4.250  
##  3rd Qu.:13.04   3rd Qu.:33.10   3rd Qu.:33.40   3rd Qu.:4.756  
##  Max.   :13.55   Max.   :36.39   Max.   :34.82   Max.   :5.565  
##                  NA's   :2                       NA's   :2      
##   drugsfitted    externalother    externalotherfitted
##  Min.   :3.449   Min.   : 7.973   Min.   : 8.388     
##  1st Qu.:3.769   1st Qu.: 9.149   1st Qu.: 9.347     
##  Median :4.323   Median : 9.561   Median : 9.690     
##  Mean   :4.255   Mean   : 9.599   Mean   : 9.610     
##  3rd Qu.:4.679   3rd Qu.:10.122   3rd Qu.: 9.939     
##  Max.   :5.130   Max.   :11.483   Max.   :10.353     
##                  NA's   :2
# Create over21 dummy variable: 1 if over 21, else 0
my_data$over21 = ifelse(my_data$agecell >= 21,1,0)

c. Reproduce Figure 3 of the paper

# fig 3 of paper: plot points first
plot(my_data$agecell, my_data$all, col = "gray", pch = 18,
    ylim = c(0, 120), xlab = "Age", ylab = "Deaths per 100,000 person-years")
points(my_data$agecell, my_data$internal, pch = 22, cex = 0.6, col = "darkgray")
points(my_data$agecell, my_data$external, pch = 17, cex = 0.6, col = "black")

# For lines, plot below the cutoff and above the cutoff
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$allfitted, NA),
      lty = 3, lwd = 3, col = "darkgray")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$allfitted, NA),
      lty = 3, lwd = 3, col = "darkgray")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$internalfitted, NA),
      lty = 5, col = "darkgrey")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$internalfitted, NA),
      lty = 5, col = "darkgrey")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$externalfitted, NA),
      lty = 1, col = "black")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$externalfitted, NA),
      lty = 1, col = "black")

legend(21, 60, lty = c(0,0,0,3,3,5,5,1,1), pch = c(18,22,17,NA,NA,NA,NA,NA,NA),
       col = c("gray","darkgray","black","darkgray","darkgrey","darkgrey","darkgrey","black","black"),
       legend = c("All","Internal","External",
                  "All fitted", "Internal fitted", "External fitted"),
       ncol = 2, cex = 0.6)

d. Simplest regression-discontinuity design

library(stargazer)
# RDD 
model <- lm(all ~ agecell + over21, data = my_data)
# Create a table with stargazer package
stargazer(model, type = "text", title = "Regression-Discontinuity Design",
          align = TRUE, keep.stat = c("n","rsq"),
          dep.var.labels = c("Discontinuity in Deaths"), covariate.labels = c("agecell", "over21"))
## 
## Regression-Discontinuity Design
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                Discontinuity in Deaths  
## ----------------------------------------
## agecell                -0.975           
##                        (0.632)          
##                                         
## over21                7.663***          
##                        (1.440)          
##                                         
## Constant             112.310***         
##                       (12.668)          
##                                         
## ----------------------------------------
## Observations             48             
## R2                      0.595           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01

This model can be expressed as:
\[ AllDeaths_{i} = \alpha + \beta(AgeCell_i) + \gamma(Over21_i) + \epsilon_{i} \] The table above is the relationship between drinking and mortality. The simple RD regression shows a significant jump in deaths at age 21. The estimated value indicates the effects of alcohol drinking on deaths.

e. Plot the results

# First estimate the fitted value
my_data$modelFit <- predict(model, my_data)

# Then, plot them based on the cutoff point
plot(my_data$agecell, my_data$all, col = "black", pch = 18, ylim = c(85, 110),
     xlab = "Age", ylab = "Deaths per 100,000 person-years", main = "RDD results")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$modelFit, NA),
      lty = 1, lwd = 3, col = "red")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$modelFit, NA),
      lty = 1, lwd = 3, col = "red")
legend("bottomright", lty=c(0,1,1), pch=c(18,NA,NA), col = c("black", "red", "red"),
       legend = c("All", "Fitted"), cex = 0.6)

f. Allow more flexibility to your RD: Run another RD where not only the intercept, but also the slope can differ

# Run the model
model1 = lm(all ~ agecell + over21 + agecell*over21, data = my_data)
stargazer(model1, type = "text", title = "Regression-Discontinuity Design", align = TRUE,
          keep.stat = c("n","rsq"), dep.var.labels = c("Discontinuity in Deaths"),
          covariate.labels = c("agecell", "over21", "agecell * over21"))
## 
## Regression-Discontinuity Design
## ============================================
##                      Dependent variable:    
##                  ---------------------------
##                    Discontinuity in Deaths  
## --------------------------------------------
## agecell                     0.827           
##                            (0.819)          
##                                             
## over21                    83.333***         
##                           (24.357)          
##                                             
## agecell * over21          -3.603***         
##                            (1.158)          
##                                             
## Constant                  76.251***         
##                           (16.396)          
##                                             
## --------------------------------------------
## Observations                 48             
## R2                          0.668           
## ============================================
## Note:            *p<0.1; **p<0.05; ***p<0.01

This model can be expressed as:
\[ AllDeaths_{i} = \alpha + \beta_1(AgeCell_i) + \beta_2(Over21_i) + \beta_3(AgeCell_i)*(Over21_i) + \epsilon_{i} \]

# Estimate
estimate <- summary(model1)$coefficients[3, 1] +
  summary(model1)$coefficients[4,1] * 21

Here, the estimate of interest is \(\beta_2 + \beta_3(AgeCell_i)*(Over21_i)\) ( = 7.663, which is calculated above). The estimate does not differ from the previous one.

g. Again, plot the data

# First estimate the fitted value
my_data$modelFit1 <- predict(model1, my_data)

# Then, plot them based on the cutoff point
plot(my_data$agecell, my_data$all, col = "black", pch = 18, ylim = c(85, 110),
     xlab = "Age", ylab = "Deaths per 100,000 person-years", main = "RDD results")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$modelFit1, NA),
      lty = 1, lwd = 3, col = "red")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$modelFit1, NA),
      lty = 1, lwd = 3, col = "red")
legend("bottomright", lty=c(0,1,1), pch=c(18,NA,NA), col = c("black", "red", "red"),
       legend = c("All", "Fitted"), cex = 0.6)

h. Compare to pre-packaged RDD: Use the rdd package

# Using rdd package
library(rdd)
model2 <- RDestimate(all~agecell, data = my_data, cutpoint = 21)
summary(model2)
## 
## Call:
## RDestimate(formula = all ~ agecell, data = my_data, cutpoint = 21)
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
## LATE       1.6561     40            9.001     1.480       6.080    1.199e-09
## Half-BW    0.8281     20            9.579     1.914       5.004    5.609e-07
## Double-BW  3.3123     48            7.953     1.278       6.223    4.882e-10
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F      Num. DoF  Denom. DoF  p        
## LATE       33.08  3         36          3.799e-10
## Half-BW    29.05  3         16          2.078e-06
## Double-BW  32.54  3         44          6.129e-11
# Plot output of regression
plot(model2)
title(main =  "RDD with default kernel and bandwidth")

Here, the local average treatment effect is roughly 9, which is higher than what we obtained from the regression above. This can be owed to the default triangular kernel and bandwidth of 1.6. We can argue that with triangular kernel and default bandwidth we were able to better estimate the treatment effect.

i. Prepackaged linear RDD: Re-run an RDestimate command, this time adding other options

# rerun the model with rectangular kernel and bandwidth 2
model3 <- RDestimate(all~agecell, data = my_data, cutpoint = 21, kernel = "rectangular", bw = 2)
summary(model3)
## 
## Call:
## RDestimate(formula = all ~ agecell, data = my_data, cutpoint = 21, 
##     bw = 2, kernel = "rectangular")
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
## LATE       2          48            7.663     1.273       6.017    1.776e-09
## Half-BW    1          24            9.753     1.902       5.128    2.929e-07
## Double-BW  4          48            7.663     1.273       6.017    1.776e-09
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F      Num. DoF  Denom. DoF  p        
## LATE       29.47  3         44          2.651e-10
## Half-BW    16.82  3         20          2.167e-05
## Double-BW  29.47  3         44          2.651e-10
# Plot output of regression
plot(model3)
title(main = "RDD with rectangular kernel and higher bandwidth")

Here, the local average treatment effect is around 7.66, which is similar to the one we obtained from the regression above. We used rectangular kernel and higher bandwidth of 2 and thus can argue that with these options we have smaller standard error compared to that in part h above but the estimation does not improve.