a. Download and read the paper:

Assignment from the paper by Carpenter and Dobkin(2009) The Effect of Alcohol Consumption on Mortality: Regression Discontinuity Evidence from the Minimum Drinking Age AEJ: Applied Economics 1(1)164-82

b. Get the Carpenter & Dobkin data:

df <- read_dta("http://masteringmetrics.com/wp-content/uploads/2015/01/AEJfigs.dta")
df <- na.omit(df)
df %>% select(c(1:7)) %>% head()
agecellallallfittedinternalinternalfittedexternalexternalfitted
19.192.891.716.616.776.275  
19.295.191.918.316.976.875  
19.292.192  18.917.173.275  
19.388.492.216.117.372.374.9
19.488.792.317.417.471.374.9
19.590.292.517.917.672.374.9

Creating a dummy for the age of 21 and above

df$over21 <- ifelse(df$agecell >= 21,1,0)
df %>% select(c(1:7,20)) %>% head()
agecellallallfittedinternalinternalfittedexternalexternalfittedover21
19.192.891.716.616.776.275  0
19.295.191.918.316.976.875  0
19.292.192  18.917.173.275  0
19.388.492.216.117.372.374.90
19.488.792.317.417.471.374.90
19.590.292.517.917.672.374.90

c. Reproduce Figure 3 of the paper

gg <- df %>% select(c(1:7,20))%>%
  ggplot(aes(x=agecell)) +
  geom_point(aes(y=all,shape=18))+
  geom_point(aes(y=internal, shape=0))+
  geom_point(aes(y=external, shape=17))+
  geom_line(aes(y=allfitted, linetype = "dotted")) + 
  geom_line(aes(y=internalfitted, linetype = "dashed")) + 
  geom_line(aes(y=externalfitted, linetype = "solid"))+
  scale_shape_identity()+
  scale_y_continuous(limits = c(0,120), breaks=seq(0,120,20))+
  geom_vline(xintercept = 21, color = "red", linetype = "dashed", size=1)+
  labs(x="Age", y="Deaths per 100,000 person-years",title = "AGE PROFILE FOR DEATH RATES")+
  theme_classic()+
  theme(legend.justification=c(1,0), legend.position="bottom",legend.title = element_blank())+
  guides(color=guide_legend(override.aes=list(shape =15),nrow=2,byrow=TRUE))

gg + 
  scale_linetype(breaks =c("dashed","dotted","solid"),
                    labels = c("External fitted", "All fitted", "Internal fitted"))

d. Simplest regression-discontinuity design

Run a simple RD regression (same slope, with a jump) for the variable “all” deaths by age.

model1 <- lm(all ~ agecell, data=df)
df$fit_model1 <- predict(model1,df)
huxtable <- huxreg(model1, 
                   coefs = c("Age"="agecell"),
                   statistics = c("N. obs." = "nobs","R squared" = "r.squared"))%>%
  set_caption("Table 1:Simplest RD design")
huxtable
Table 1:Simplest RD design
(1)
Age1.940 ***
(0.399)   
N. obs.48        
R squared0.340    
*** p < 0.001; ** p < 0.01; * p < 0.05.

Mortality increases with Age and Age seems to be significant. We dont get much information from this other than the linear relationship between the variables.

e. Plot the results from Simplest regression-discontinuity design

df %>% ggplot(aes(x=agecell, y=all, color=factor(over21))) +
  geom_point()+
  geom_smooth(method = "lm", alpha = .15)+
  labs(x="Age", y="Deaths per 100,000 person-years",title = "Simplest RD: AGE PROFILE FOR DEATH RATES")+
  geom_vline(xintercept = 21, color = "red", linetype = "dashed", size=1)+
  theme_classic()+
  theme(legend.justification=c(1,0), legend.position=c(1,0),legend.title=element_blank())+
  scale_colour_discrete(limits = c("0", "1"),labels = c("Under 21", "21 and above"))

Mortality increases with Age and Age seems to be significant. We dont get much information from this other than the linear relationship between the variables.

f. Allowing flexibility to RD design by including the interaction

model2 <- lm(all ~ agecell +over21 +agecell*over21 , data=df)
df$fit_model2 <- predict(model2, df)
huxtable <- huxreg(model2, 
                   statistics = c("N. obs." = "nobs","R squared" = "r.squared"))%>%
  set_caption("Table 2: Flexible RDD")
huxtable
Table 2: Flexible RDD
(1)
(Intercept)76.251 ***
(16.396)   
agecell0.827    
(0.819)   
over2183.333 ** 
(24.357)   
agecell:over21-3.603 ** 
(1.158)   
N. obs.48        
R squared0.668    
*** p < 0.001; ** p < 0.01; * p < 0.05.

Comparing Simple and Flexible RDD

model_tbl1 = list("Simple RD (1)" = model1,"Flexi RD (2)" = model2)


coefs <- names(coef(model_tbl1[[1]]))[str_detect(names(coef(model_tbl1[[1]])), "vdc")]


huxtable <- huxreg(model_tbl1,number_format = 3, omit_coefs =coefs,
       coefs = c("Age"="agecell","Over 21"="over21","Age:Over 21"="agecell:over21"),
      statistics = c("Sample size:" = "nobs", "R2" = "r.squared"))%>% 
  set_caption("Table 3: Compare Simple and Flexi RD")
huxtable
Table 3: Compare Simple and Flexi RD
Simple RD (1)Flexi RD (2)
Age1.940 ***0.827   
(0.399)   (0.819)  
Over 21        83.333 **
        (24.357)  
Age:Over 21        -3.603 **
        (1.158)  
Sample size:48        48       
R20.340    0.668   
*** p < 0.001; ** p < 0.01; * p < 0.05.

Interaction term is significant and the coefficient of Over 21 is high indicating flexible RD is unbiased as it takes into account death over the period (the non-linearity around the threshold of 21 years).

g. Plot the results from flexible RD design

df %>% ggplot(aes(x=agecell, y=all, color=factor(over21))) +
  geom_point()+
  geom_smooth(method = "lm", alpha = .15)+
  labs(x="Age", y="Deaths per 100,000 person-years",title = "Flexible Regression Discontinuity Plot")+
  geom_vline(xintercept = 21, color = "red", linetype = "dashed", size=1)+
  theme_classic()+
  theme(legend.justification=c(1,0), legend.position=c(1,0),legend.title = element_blank())+
  scale_colour_discrete(limits = c("0", "1"),labels = c("Under 21", "21 and above"))

h. Comparing to pre-packaged RDD

model3 <- RDestimate(all ~ agecell, cutpoint = 21, data=df)
summary(model3)
## 
## Call:
## RDestimate(formula = all ~ agecell, data = df, cutpoint = 21)
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
## LATE       1.6561     40            9.001     1.480       6.080    1.199e-09
## Half-BW    0.8281     20            9.579     1.914       5.004    5.609e-07
## Double-BW  3.3123     48            7.953     1.278       6.223    4.882e-10
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F      Num. DoF  Denom. DoF  p        
## LATE       33.08  3         36          3.799e-10
## Half-BW    29.05  3         16          2.078e-06
## Double-BW  32.54  3         44          6.129e-11
plot(model3)
title(main = "Pre-packaged RDD",
     xlab = "Age", ylab="Deaths per 100,000 person-years")
abline(v=21,lwd=2,lty="dashed",col="red")

We can see the sharp discontinuity around the threshold of 21 which indicates the higher Average Treatment Effect. #### i. Comparing to pre-packaged RDD with “rectangular” kernel and bw=2

model4 <- RDestimate(all ~ agecell, cutpoint = 21, kernel = "rectangular", bw=2 ,data=df)
summary(model4)
## 
## Call:
## RDestimate(formula = all ~ agecell, data = df, cutpoint = 21, 
##     bw = 2, kernel = "rectangular")
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|) 
## LATE       2          48            7.663     1.273       6.017    1.776e-09
## Half-BW    1          24            9.753     1.902       5.128    2.929e-07
## Double-BW  4          48            7.663     1.273       6.017    1.776e-09
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F      Num. DoF  Denom. DoF  p        
## LATE       29.47  3         44          2.651e-10
## Half-BW    16.82  3         20          2.167e-05
## Double-BW  29.47  3         44          2.651e-10
plot(model4)
title(main = "Pre-packaged RDD with rectangular kernaland bw=2",
     xlab = "Age", ylab="Deaths per 100,000 person-years")
abline(v=21,lwd=2,lty="dashed",col="red")

Sharp discontinuity around the threshold is evident however the standard error has reduced now indicating a better choice of bw. Kernal argument is specifically used to weight the observations based on their distaance from the threshold. Rectangular means no weightage.