Homework 6 - Regression discontinuity

Tasks

b. Load data

# Load dataset
df <- read.dta("AEJfigs.dta")
head(df)

##    agecell      all allfitted internal internalfitted external externalfitted
## 1 19.06849 92.82540  91.70615 16.61759       16.73813 76.20782       74.96801
## 2 19.15068 95.10074  91.88372 18.32768       16.92065 76.77306       74.96307
## 3 19.23288 92.14429  92.04906 18.91105       17.09884 73.23324       74.95023
## 4 19.31507 88.42776  92.20214 16.10177       17.27268 72.32598       74.92947
## 5 19.39726 88.70494  92.34292 17.36352       17.44216 71.34142       74.90076
## 6 19.47945 90.19179  92.47134 17.87210       17.60725 72.31968       74.86409
##     alcohol alcoholfitted homicide homicidefitted  suicide suicidefitted
## 1 0.6391380     0.7943445 16.31682       16.28457 11.20371      11.59210
## 2 0.6774093     0.8375749 16.85996       16.27070 12.19337      11.59361
## 3 0.8664426     0.8778347 15.21925       16.26288 11.71581      11.59513
## 4 0.8673084     0.9151149 16.74282       16.26115 11.27501      11.59665
## 5 1.0191631     0.9494066 14.94773       16.26551 10.98431      11.59819
## 6 1.1713219     0.9807007 15.64281       16.27599 12.16663      11.59973
##        mva mvafitted    drugs drugsfitted externalother externalotherfitted
## 1 35.82933  34.81778 3.872425    3.448835      8.534373            8.388236
## 2 35.63926  34.63389 3.236511    3.470022      8.655786            8.530174
## 3 34.20565  34.44674 3.202071    3.492069      8.513741            8.662681
## 4 32.27896  34.25630 3.280689    3.514980      8.258285            8.785728
## 5 32.65097  34.06259 3.548198    3.538755      8.417533            8.899288
## 6 32.72144  33.86558 3.211689    3.563399      7.972546            9.003332

# Create over21 dummy
df$over21 <- ifelse(df$agecell >= 21, 1, 0)

c. Reproduce Figure 3 of the paper

plot(df$agecell, df$all, 
             xlab = "Age",
             xaxt = 'n',
             ylab = "Death per 100,000 persons-years",
             ylim = c(0, 120),
             col = "#626669", pch = 18)
axis (side = 1, at = seq(19, 23, by =  0.5))
points(df$agecell, df$internal, pch = 22, cex = 0.6, col = "darkgray") 
points(df$agecell, df$external, pch = 17, cex = 0.6, col = "black")

lines(df$agecell, ifelse(df$over21 == 1, df$allfitted, NA), 
      lty = 3, lwd = 2, col = "darkgray")
lines(df$agecell, ifelse(df$over21 == 0, df$allfitted, NA), 
      lty = 3, lwd = 2, col = "darkgray")
lines(df$agecell, ifelse(df$over21 == 1, df$internalfitted, NA),
      lty = 5, col = "darkgrey")
lines(df$agecell, ifelse(df$over21 == 0, df$internalfitted, NA),
      lty = 5, col = "darkgrey")
lines(df$agecell, ifelse(df$over21 == 1, df$externalfitted, NA),
      lty = 1, col = "black")
lines(df$agecell, ifelse(df$over21 == 0, df$externalfitted, NA),
      lty = 1, col = "black")
  
legend(21.60, 60, lty = c(0,0,0,3,3,5,5,1,1), pch = c(18,22,17,NA,NA,NA,NA,NA,NA),
       col = c("#626669","darkgray","black","darkgray","darkgrey","darkgrey","darkgrey","black","black"),
       legend = c("All","Internal","External",
                  "All fitted", "Internal fitted", "External fitted"),
       ncol = 2, cex = 0.6)

d. Simplest regression-discontinuity design

mod <- lm(all ~ agecell + over21, data = df)
stargazer(mod, type = "text", title = "Regression-Discontinuity Design",
          align = TRUE, keep.stat = c("n","rsq"),
          dep.var.labels = c("Discontinuity in Deaths"), covariate.labels = c("agecell", "over21"))

## 
## Regression-Discontinuity Design
## ========================================
##                  Dependent variable:    
##              ---------------------------
##                Discontinuity in Deaths  
## ----------------------------------------
## agecell                -0.975           
##                        (0.632)          
##                                         
## over21                7.663***          
##                        (1.440)          
##                                         
## Constant             112.310***         
##                       (12.668)          
##                                         
## ----------------------------------------
## Observations             48             
## R2                      0.595           
## ========================================
## Note:        *p<0.1; **p<0.05; ***p<0.01

The coefficient of dummy variable over21 is the treatment effect. It can be interpreted as on average, the mortality rate per 100,000 individuals reaching the minimum drinking age is 7.66 times higher.

e. Plot the results

plot(df$agecell, df$all, 
             xlab = "Age",
             xaxt = 'n',
             ylab = "Death per 100,000 persons-years",
             ylim = c(80, 110),
             col = "#626669", pch = 18)
axis (side = 1, at = seq(19, 23, by =  0.5))

df$modPredict <- predict(mod, df)  #getting the fitted values

lines(df$agecell, ifelse(df$over21 == 1, df$modPredict, NA), 
      lty = 1, lwd = 2, col = "#FF4500")
lines(df$agecell, ifelse(df$over21 == 0, df$modPredict, NA), 
      lty = 1, lwd = 2, col = "#FF4500")
legend(22.5, 90, lty = c(0,1,1), pch = c(18,NA,NA), col = c("#626669", "#FF4500", "#FF4500"), legend = c("All", "Fitted"), cex = 0.6)

f. Allow more flexibility to your RD : Run another RD where not only the intercept, but also the slope can differ

mod1 <- lm(all ~ agecell + over21 + agecell*over21, data = df)
stargazer(mod1, type = "text", title = "Regression-Discontinuity Design",
          align = TRUE, keep.stat = c("n","rsq"),
          dep.var.labels = c("Discontinuity in Deaths"), covariate.labels = c("agecell", "over21", "agecell*over21"))

## 
## Regression-Discontinuity Design
## ==========================================
##                    Dependent variable:    
##                ---------------------------
##                  Discontinuity in Deaths  
## ------------------------------------------
## agecell                   0.827           
##                          (0.819)          
##                                           
## over21                  83.333***         
##                         (24.357)          
##                                           
## agecell*over21          -3.603***         
##                          (1.158)          
##                                           
## Constant                76.251***         
##                         (16.396)          
##                                           
## ------------------------------------------
## Observations               48             
## R2                        0.668           
## ==========================================
## Note:          *p<0.1; **p<0.05; ***p<0.01

estimate <- summary(mod1)$coefficients[3, 1] +
  summary(mod1)$coefficients[4,1] * 21
estimate

## [1] 7.662709

Here our interest is on the the sum of coefficients of over21 and (agecell * over21). The sum is 7.633 which is equal to the coefficient on d. So, there is no any change in the interpretation.

g. Again, plot the data

plot(df$agecell, df$all, 
             xlab = "Age",
             xaxt = 'n',
             ylab = "Death per 100,000 persons-years",
             ylim = c(80, 110),
             col = "#626669", pch = 18)
axis (side = 1, at = seq(19, 23, by =  0.5))

df$modPredict <- predict(mod1, df)  #getting the fitted values

lines(df$agecell, ifelse(df$over21 == 1, df$modPredict, NA), 
      lty = 1, lwd = 2, col = "#FF4500")
lines(df$agecell, ifelse(df$over21 == 0, df$modPredict, NA), 
      lty = 1, lwd = 2, col = "#FF4500")
legend(22.5, 90, lty = c(0,1,1), pch = c(18,NA,NA), col = c("#626669", "#FF4500", "#FF4500"), legend = c("All", "Fitted"), cex = 0.6)

h. Compare to pre-packaged RDD

modrdd <- RDestimate(all ~ agecell, data = df, cutpoint = 21)
summary(modrdd)

## 
## Call:
## RDestimate(formula = all ~ agecell, data = df, cutpoint = 21)
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
## LATE       1.6561     40            9.001     1.480       6.080    1.199e-09
## Half-BW    0.8281     20            9.579     1.914       5.004    5.609e-07
## Double-BW  3.3123     48            7.953     1.278       6.223    4.882e-10
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F      Num. DoF  Denom. DoF  p        
## LATE       33.08  3         36          3.799e-10
## Half-BW    29.05  3         16          2.078e-06
## Double-BW  32.54  3         44          6.129e-11

plot(modrdd)

Using a bandwidth of 1.65 and default triangular kernel, the estimated average treatment effect is 9.01 which is higher than what we got in part d.

i. Prepackaged linear RDD

modrdd1 <- RDestimate(all ~ agecell, data = df, cutpoint = 21, 
                     kernel = "rectangular", bw = 2)
summary(modrdd1)

## 
## Call:
## RDestimate(formula = all ~ agecell, data = df, cutpoint = 21, 
##     bw = 2, kernel = "rectangular")
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
## LATE       2          48            7.663     1.273       6.017    1.776e-09
## Half-BW    1          24            9.753     1.902       5.128    2.929e-07
## Double-BW  4          48            7.663     1.273       6.017    1.776e-09
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F      Num. DoF  Denom. DoF  p        
## LATE       29.47  3         44          2.651e-10
## Half-BW    16.82  3         20          2.167e-05
## Double-BW  29.47  3         44          2.651e-10

plot(modrdd1)

Here, with the band width of 2 and rectangular kernel, the average treatment effect is 7.66 which is similar to what we got in part d and f but smaller than part f.