a. Download and read the paper:

Carpenter, C., & Dobkin, C. (2009). The effect of alcohol consumption on mortality: regression discontinuity evidence from the minimum drinking age. American Economic Journal: Applied Economics, 1(1), 164-82.

b. Get the Carpenter & Dobkin and data:

Download Data

Can be downloaded straight here http://masteringmetrics.com/wp-content/uploads/2015/01/AEJfigs.dta

Load and summarise data

rm(list = ls())
setwd("C:\\Users\\Akash\\Dropbox\\UGA\\AAEC8610AdvEcotrix_Filipski\\FilipskiHW10")
library(tidyverse)
library(haven)
library(stargazer)
hw10data <- read_dta(file = "AEJfigs.dta")
summary(hw10data)

    agecell           all           allfitted         internal    
 Min.   :19.07   Min.   : 88.43   Min.   : 91.71   Min.   :15.98  
 1st Qu.:20.08   1st Qu.: 92.79   1st Qu.: 93.04   1st Qu.:18.60  
 Median :21.00   Median : 95.69   Median : 95.18   Median :20.29  
 internalfitted     external     externalfitted     alcohol      
 Min.   :16.74   Min.   :71.34   Min.   :73.16   Min.   :0.6391  
 1st Qu.:18.67   1st Qu.:73.04   1st Qu.:74.06   1st Qu.:0.9962  
 Median :20.54   Median :74.81   Median :74.74   Median :1.2119  
 alcoholfitted       homicide     homicidefitted     suicide     
 Min.   :0.7943   Min.   :14.95   Min.   :16.26   Min.   :10.89  
 1st Qu.:1.0724   1st Qu.:16.61   1st Qu.:16.54   1st Qu.:11.61  
 Median :1.2471   Median :16.99   Median :16.99   Median :12.20  
 suicidefitted        mva          mvafitted         drugs      
 Min.   :11.59   Min.   :26.86   Min.   :27.87   Min.   :3.202  
 1st Qu.:11.61   1st Qu.:30.12   1st Qu.:30.17   1st Qu.:3.755  
 Median :12.25   Median :31.64   Median :31.73   Median :4.314  
  drugsfitted    externalother    externalotherfitted
 Min.   :3.449   Min.   : 7.973   Min.   : 8.388     
 1st Qu.:3.769   1st Qu.: 9.149   1st Qu.: 9.347     
 Median :4.323   Median : 9.561   Median : 9.690     
 [ reached getOption("max.print") -- omitted 4 rows ]

Create subset data with needed variables

QUESTION: Look over the data, understand what the different variables are. The key variables you need are agecell, all, internal, external, as well as the versions of those variables that have the suffix fitted.

hw10data <- subset(hw10data, select = c("agecell", "all", "allfitted",
                                        "internal", "internalfitted",
                                        "external","externalfitted"))

head(hw10data)

# A tibble: 6 x 7
  agecell   all allfitted internal internalfitted external externalfitted
    <dbl> <dbl>     <dbl>    <dbl>          <dbl>    <dbl>          <dbl>
1    19.1  92.8      91.7     16.6           16.7     76.2           75.0
2    19.2  95.1      91.9     18.3           16.9     76.8           75.0
3    19.2  92.1      92.0     18.9           17.1     73.2           75.0
4    19.3  88.4      92.2     16.1           17.3     72.3           74.9
5    19.4  88.7      92.3     17.4           17.4     71.3           74.9
6    19.5  90.2      92.5     17.9           17.6     72.3           74.9

Make Over21 dummy variable:

QUESTION: Make an over21 dummy to help with the following analysis.

hw10data$over21 =  as.numeric(hw10data$agecell>=21)
#length(hw10data$agecell)
#length(hw10data$all[!is.na(hw10data$all)])
hw10data <- na.omit(hw10data)
summary(hw10data)

    agecell           all           allfitted         internal    
 Min.   :19.07   Min.   : 88.43   Min.   : 91.71   Min.   :15.98  
 1st Qu.:20.03   1st Qu.: 92.79   1st Qu.: 93.01   1st Qu.:18.60  
 Median :21.00   Median : 95.69   Median : 95.18   Median :20.29  
 Mean   :21.00   Mean   : 95.67   Mean   : 95.71   Mean   :20.29  
 3rd Qu.:21.97   3rd Qu.: 98.03   3rd Qu.: 97.69   3rd Qu.:21.98  
 Max.   :22.93   Max.   :105.27   Max.   :102.59   Max.   :24.37  
 internalfitted     external     externalfitted      over21   
 Min.   :16.74   Min.   :71.34   Min.   :73.23   Min.   :0.0  
 1st Qu.:18.61   1st Qu.:73.04   1st Qu.:74.07   1st Qu.:0.0  
 Median :20.51   Median :74.81   Median :74.74   Median :0.5  
 Mean   :20.27   Mean   :75.39   Mean   :75.44   Mean   :0.5  
 3rd Qu.:21.72   3rd Qu.:77.24   3rd Qu.:75.89   3rd Qu.:1.0  
 Max.   :24.04   Max.   :83.33   Max.   :81.48   Max.   :1.0

#a =mean(hw10data$all[hw10data$over21==0])
#b =mean(hw10data$all[hw10data$over21==1])
#c = b-a  # First Difference or our naive estimate of the effect of intended MLDA on mortality
## But it is not that simple .

c. Reproduce Figure 3 of the paper

attach(hw10data)
Break.point = 21.00
hw10data <- hw10data[order(hw10data$agecell),]



# Define the position of tick marks
v1 <- c(19, 19.5,20, 20.5, 21, 21.5, 22, 22.5, 23)

# Define the labels of tick marks
v2 <- c("19", "19.5", "20", "20.5", "21", "21.5", "22", "22.5", "23")

plot(agecell, 
     all, 
     type="n", 
     axes=FALSE,
     #ann=FALSE, ## Removes label of axis
     xaxt = "n",
     xlim=c(19, 23), 
     ylim = c(0,120),
     xlab = "Age",
     ylab = "Deaths per 100,000 person-years",
     frame.plot = FALSE,
     xaxs="i",yaxs="i")

# Add a title
title("Figure 3. Age Profile for Death Rates")

#Draw a box
#box()
#pos=0, lwd.ticks=0
# Add an axis to the plot 
axis(side = 1, 
     at = v1, 
     labels = v2,
     tck=-.03,pos=0,
     las=0,cex.axis=1.05,
     font.axis=5)
# Add (modified) axes to the plot
axis(2, las=2)
axis(side = 2, 
     tck=-.03,pos=0,
     las=2,cex.axis=1.05,
     font.axis=5)

#We then fit two models, the second has a quadratic age term.

#fit=lm(all~agecell, data = dat.1)
#fit2=lm(all~agecell+I(agecell^2), data=dat.2)
#summary(fit)
#summary(fit2)
#hw10data$predfit = predict(fit, dat.1)
#hw10data$predfit2=predict(fit2,dat.2)


points(agecell, 
       all, 
       col="gray", 
       pch=18,cex = .7)
with(subset(hw10data,agecell < Break.point),
     lines(agecell,allfitted, lty = 3))
with(subset(hw10data,agecell  >= Break.point),
    lines(agecell,allfitted,lty = 3))




points(agecell, 
       internal, 
       pch=0,cex = .4)
with(subset(hw10data,agecell < Break.point),
     lines(agecell,internalfitted, lty = 2, col="azure4", lwd=2))
with(subset(hw10data,agecell  >= Break.point),
    lines(agecell,internalfitted,lty = 2, col="azure4", lwd=2))


points(agecell, 
       external, 
       col="black", 
       pch=17,cex = .7)
with(subset(hw10data,agecell < Break.point),
     lines(agecell,externalfitted, lty = 1))
with(subset(hw10data,agecell  >= Break.point),
    lines(agecell,externalfitted,lty = 1))


# Add subtitles for legend
legend(21.5,60,
       inset=.05,
       cex = 0.7,
       c("All","Internal", "External","All fitted" ,"Internal fitted","External fitted"),
       pch = c(18,0,17,NA,NA,NA),
       pt.cex= c(0.9,0.9,0.9,NA,NA,NA),
       lty=c(NA,NA,NA,3,2,1), 
       ncol = 2,
       lwd=c(NA,NA,NA,1,1,1),
       col=c("gray","black","black", "azure4","azure4","black")
)

d. Simplest regression-discontinuity design

Run “Simple RDD”:

QUESTION: Run a simple RD regression (same slope, with a jump) for “all” deaths by age.

simple.rd = lm(all ~ agecell+over21, data = hw10data)
hw10data$pred <- predict(simple.rd)
summary(simple.rd)


Call:
lm(formula = all ~ agecell + over21, data = hw10data)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.0559 -1.8483  0.1149  1.4909  5.8043 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 112.3097    12.6681   8.866 1.96e-11 ***
agecell      -0.9747     0.6325  -1.541     0.13    
over21        7.6627     1.4403   5.320 3.15e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.493 on 45 degrees of freedom
Multiple R-squared:  0.5946,    Adjusted R-squared:  0.5765 
F-statistic: 32.99 on 2 and 45 DF,  p-value: 1.508e-09

#summary(lm(all ~ over21, data = hw10data))

Anlyse the results?:

QUESTION : Analyse the results. How do you use these results to estimate the relationship between drinking and mortality?

Analysis: We ran a simple OLS regression using the cut-point or jump to estimate the effect of the treatment effect. In particular, the regression looks like: \[ \widehat{all.deaths} = \hat{\beta_0} +\hat{\beta_1} * Agecell +\hat{\beta_2} * Over21 \]
- \(\hat{\beta_0}\) is the intercept that measures the average mortality rate for the comparison group at the discontinuity/jump. In Our case it is 112.31 deaths per 100,000 observations.
- The parameter \(\hat{\beta_2}\) is the effect of treatment (average treatment effect) or the difference in mortality between those above 21 and those below 21. It captures the jump in deaths at age 21. Above, we obtain an an estimate of \(\hat{\beta_2}\) equal to 7.7, ceteris paribus. This estimate indicates a substantial increase in mortality risk at the MLDA cutoff.
- The parameter \(\hat{\beta_1}\) is the underlying trend that accounts for the relationship between agecell and mortality risk. It is statistically insignifcant here.

e. Plot the results

#summary(hw10data)
plot(hw10data$agecell,hw10data$all, pch=17, frame.plot = FALSE,
     xaxs="i",yaxs="i",
     xlim=c(19,23), ylim=c(85,110),
     main = "A Simple RD regression  : \nFitted line for subsamples",
     xlab = "Age", ylab =" 'All' Deaths ", cex.main=1, font.main=1)
with(subset(hw10data, over21==0),lines(agecell, pred, col="red", lty=3, lwd=3))
with(subset(hw10data, over21==1),lines(agecell, pred, col="chocolate1", lty=3, lwd=3))

OR, We can plot the fitted line for the entire sample:

plot(hw10data$agecell,hw10data$all, pch=17, frame.plot = FALSE,
     xaxs="i",yaxs="i",xlim=c(19,23), ylim=c(85,110),
     main = "A Simple RD regression  : \nFitted line across entire sample",
     xlab = "Age", ylab =" 'All' Deaths ", cex.main=1, font.main=1)
lines(hw10data$agecell, hw10data$pred, col="red", lty=2, lwd=2)

f. Allow more flexibility to your RD

Run a flexible RDD :

QUESTION: Run another RD where not only the intercept, but also the slope can differ between the under- and over-21 subsamples. Analyse the results. What is your estimate of interest. Does it differ from the previous one?

ANSWER:

flex.rd = lm(all ~ agecell+over21+I(agecell*over21), data = hw10data)
hw10data$flex.pred <- predict(flex.rd)
summary(flex.rd)


Call:
lm(formula = all ~ agecell + over21 + I(agecell * over21), data = hw10data)

Residuals:
   Min     1Q Median     3Q    Max 
-4.368 -1.787  0.117  1.108  5.341 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)          76.2515    16.3965   4.650 3.03e-05 ***
agecell               0.8270     0.8189   1.010  0.31809    
over21               83.3332    24.3568   3.421  0.00136 ** 
I(agecell * over21)  -3.6034     1.1581  -3.111  0.00327 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.283 on 44 degrees of freedom
Multiple R-squared:  0.6677,    Adjusted R-squared:  0.645 
F-statistic: 29.47 on 3 and 44 DF,  p-value: 1.325e-10

Anaysis:

Analysis: Here, we see what is reported looks different than before. But the estimate of the program is the same as before. The parameter of interest here, again , is ** “Over21” ** that measures the effect of the program but now it is combined in the interaction term with agecell. So, our marginal effect of being over21 is : \(\beta_2 +\beta_3 ([agecell==21] \times [over21==1])\)

83.3332 +(-3.6034*21)

[1] 7.6618

The estimate of the program is “same” as before.

g. Again, plot the data

QUESTION: Again, plot the data and the regression lines defined by the regression.

plot(hw10data$agecell,hw10data$all, 
     pch=10, frame.plot = FALSE,
     xaxs="i",yaxs="i",
     xlim=c(19,23), ylim=c(85,110),
     main = "Flexible RD regression : \nFitted line for subsamples",
     xlab = "Age", ylab =" 'All' Deaths ", 
     cex.main=1, font.main=1)
with(subset(hw10data, over21==0),lines(agecell, flex.pred, col="red", lty=1, lwd=3))
with(subset(hw10data, over21==1),lines(agecell, flex.pred, col="chocolate1", lty=1, lwd=3))

OR, We can plot the fitted line for the entire sample:

plot(hw10data$agecell,hw10data$all, 
     pch=10, frame.plot = FALSE,
     xaxs="i",yaxs="i",xlim=c(19,23), ylim=c(85,110),
     main = "Flexible RD regression: \nFitted line across entire sample",
     xlab = "Age", ylab =" 'All' Deaths ", cex.main=1, font.main=1)
lines(hw10data$agecell, hw10data$flex.pred, col="red", lty=1, lwd=3)

h. Compare to pre-packaged RDD with cutoff at 21 nd default options:

Run the RDestimate command

QUESTION: Use the rdd package. Run the RDestimate command on all~agecell, specifying your data and the cutoff option at 21.

library(rdd)

rdd_simple <- RDestimate(all~agecell, data = hw10data, cutpoint = 21)

Print the output of your regression.

summary(rdd_simple)


Call:
RDestimate(formula = all ~ agecell, data = hw10data, cutpoint = 21)

Type:
sharp 

Estimates:
           Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
LATE       1.6561     40            9.001     1.480       6.080    1.199e-09
Half-BW    0.8281     20            9.579     1.914       5.004    5.609e-07
Double-BW  3.3123     48            7.953     1.278       6.223    4.882e-10
              
LATE       ***
Half-BW    ***
Double-BW  ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

F-statistics:
           F      Num. DoF  Denom. DoF  p        
LATE       33.08  3         36          3.799e-10
Half-BW    29.05  3         16          2.078e-06
Double-BW  32.54  3         44          6.129e-11

Plot the output of your regression.

plot(rdd_simple)

Analysis:

QUESTION: Discuss the results. Do they differ from what you found? Why?

Here, we see that the coefficient called “LATE”, is our parameter of interest of the MLDA program on mortality.
We observe that by default the “RDD” package chooses a bandwidth of “1.6561” with a “triangular kernel”
The estimate of the treatment is much larger than the “simple RD” we ran above. We observe that deaths increase by 9.001 at the cutoff point of 21.
The larger increase we observe is because of the choice of bandwidth that looks at observations around the cutoff makes the treatment and control group “look similar”. With this RDD framework, we are able to better capture the effect of MLDA on mortality among those close to 21.
However, in our earlier analysis with the OLS regression in a simple RD design we used the entire sample of observations where the comparison between treatment and control becomes “less similar” with the use of the full sample. The full sample just reflects the average trend in mortality across the age cohorts.

i . Prepackaged linear RDD with our bandwidth and kernel choice:

Re-run an RDestimate command

QUESTION: Re-run an RDestimate command, this time adding the options kernel = “rectangular” and bw=2. ANSWER

library(rdd)
rdd_kernelbw <- RDestimate(all~agecell, data = hw10data, cutpoint = 21,
                           kernel = "rectangular", bw=2)

Output the results.

summary(rdd_kernelbw)


Call:
RDestimate(formula = all ~ agecell, data = hw10data, cutpoint = 21, 
    bw = 2, kernel = "rectangular")

Type:
sharp 

Estimates:
           Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
LATE       2          48            7.663     1.273       6.017    1.776e-09
Half-BW    1          24            9.753     1.902       5.128    2.929e-07
Double-BW  4          48            7.663     1.273       6.017    1.776e-09
              
LATE       ***
Half-BW    ***
Double-BW  ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

F-statistics:
           F      Num. DoF  Denom. DoF  p        
LATE       29.47  3         44          2.651e-10
Half-BW    16.82  3         20          2.167e-05
Double-BW  29.47  3         44          2.651e-10

Plot them.

plot(rdd_kernelbw)

Analysis:

QUESTION: Compare to previous output and discuss

Here, we see that with the larger badwidth we are essentially capturing the entire sample and our estimates are closer to when we ran the “simple RD” design in the begining. In an RDD framework, we always need to be balance our choice of large and small bandwidth. Larger bandwidth make the comparison between the treatment group and control group “less similar”.
Secondly, the rectangular kernel, is a uniform kernel that gives equal weight to all observations within the bandwidth. However, the default kernel choice in rdd is triangular that weights those very near the curtoff higher than those away from the cutoff.
Here, we see the standard error with LATE is smaller than the “default” RDD with bandwidth of 1.6. Hence, with a larger bandwidth we might gain smaller standard error but we lose on the true effect.
Changing bamdwidth can have 1) estimation implication where increasing bandwidth captures more observations and less standard errors and 2) validity implication. Larger bandwidth makes the treatment comparison cohorts less similar.

References :

R-mardown theme : https://github.com/juba/rmdformats/blob/master/resources/examples/readthedown/readthedown.Rmd
https://stats.stackexchange.com/questions/30975/how-to-add-non-linear-trend-line-to-a-scatter-plot-in-r
https://stackoverflow.com/questions/10007830/loess-line-not-plotting-correctly/10008228#10008228
https://www.r-graph-gallery.com/44-polynomial-curve-fitting.html
https://dev-ii-seminar.readthedocs.io/en/latest/notebooks/RDD_R.html
Introduction to Regression Discontinuity https://www.youtube.com/watch?v=uHNhhOq0jE4
An intuitive introduction to Regression Discontinuity https://youtu.be/tWRsYWSP3fM via @YouTube

Task 10: Regression Discontinuity

RDD analysing legal drinking age effect on mortality

Task 10: Regression Discontinuity

a. Download and read the paper:

b. Get the Carpenter & Dobkin and data:

Download Data

Load and summarise data

Create subset data with needed variables

Make Over21 dummy variable:

c. Reproduce Figure 3 of the paper

d. Simplest regression-discontinuity design

Run “Simple RDD”:

Anlyse the results?:

e. Plot the results

f. Allow more flexibility to your RD

Run a flexible RDD :

Anaysis:

g. Again, plot the data

h. Compare to pre-packaged RDD with cutoff at 21 nd default options:

Run the RDestimate command

Print the output of your regression.

Plot the output of your regression.

Analysis:

i . Prepackaged linear RDD with our bandwidth and kernel choice:

Re-run an RDestimate command

Output the results.

Plot them.

Analysis:

References :