Assignment from the paper by Carpenter and Dobkin(2009) The Effect of Alcohol Consumption on Mortality: Regression Discontinuity Evidence from the Minimum Drinking Age AEJ: Applied Economics 1(1)164-82
df <- read_dta("http://masteringmetrics.com/wp-content/uploads/2015/01/AEJfigs.dta")
df <- na.omit(df)
df %>% select(c(1:7)) %>% head()
agecell | all | allfitted | internal | internalfitted | external | externalfitted |
---|---|---|---|---|---|---|
19.1 | 92.8 | 91.7 | 16.6 | 16.7 | 76.2 | 75 |
19.2 | 95.1 | 91.9 | 18.3 | 16.9 | 76.8 | 75 |
19.2 | 92.1 | 92 | 18.9 | 17.1 | 73.2 | 75 |
19.3 | 88.4 | 92.2 | 16.1 | 17.3 | 72.3 | 74.9 |
19.4 | 88.7 | 92.3 | 17.4 | 17.4 | 71.3 | 74.9 |
19.5 | 90.2 | 92.5 | 17.9 | 17.6 | 72.3 | 74.9 |
df$over21 <- ifelse(df$agecell >= 21,1,0)
df %>% select(c(1:7,20)) %>% head()
agecell | all | allfitted | internal | internalfitted | external | externalfitted | over21 |
---|---|---|---|---|---|---|---|
19.1 | 92.8 | 91.7 | 16.6 | 16.7 | 76.2 | 75 | 0 |
19.2 | 95.1 | 91.9 | 18.3 | 16.9 | 76.8 | 75 | 0 |
19.2 | 92.1 | 92 | 18.9 | 17.1 | 73.2 | 75 | 0 |
19.3 | 88.4 | 92.2 | 16.1 | 17.3 | 72.3 | 74.9 | 0 |
19.4 | 88.7 | 92.3 | 17.4 | 17.4 | 71.3 | 74.9 | 0 |
19.5 | 90.2 | 92.5 | 17.9 | 17.6 | 72.3 | 74.9 | 0 |
gg <- df %>% select(c(1:7,20))%>%
ggplot(aes(x=agecell)) +
geom_point(aes(y=all,shape=18))+
geom_point(aes(y=internal, shape=0))+
geom_point(aes(y=external, shape=17))+
geom_line(aes(y=allfitted, linetype = "dotted")) +
geom_line(aes(y=internalfitted, linetype = "dashed")) +
geom_line(aes(y=externalfitted, linetype = "solid"))+
scale_shape_identity()+
scale_y_continuous(limits = c(0,120), breaks=seq(0,120,20))+
geom_vline(xintercept = 21, color = "red", linetype = "dashed", size=1)+
labs(x="Age", y="Deaths per 100,000 person-years",title = "AGE PROFILE FOR DEATH RATES")+
theme_classic()+
theme(legend.justification=c(1,0), legend.position="bottom",legend.title = element_blank())+
guides(color=guide_legend(override.aes=list(shape =15),nrow=2,byrow=TRUE))
gg +
scale_linetype(breaks =c("dashed","dotted","solid"),
labels = c("External fitted", "All fitted", "Internal fitted"))
Run a simple RD regression (same slope, with a jump) for the variable “all” deaths by age.
model1 <- lm(all ~ agecell, data=df)
df$fit_model1 <- predict(model1,df)
huxtable <- huxreg(model1,
coefs = c("Age"="agecell"),
statistics = c("N. obs." = "nobs","R squared" = "r.squared"))%>%
set_caption("Table 1:Simplest RD design")
huxtable
(1) | |
---|---|
Age | 1.940 *** |
(0.399) | |
N. obs. | 48 |
R squared | 0.340 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Mortality increases with Age and Age seems to be significant. We dont get much information from this other than the linear relationship between the variables.
df %>% ggplot(aes(x=agecell, y=all, color=factor(over21))) +
geom_point()+
geom_smooth(method = "lm", alpha = .15)+
labs(x="Age", y="Deaths per 100,000 person-years",title = "Simplest RD: AGE PROFILE FOR DEATH RATES")+
geom_vline(xintercept = 21, color = "red", linetype = "dashed", size=1)+
theme_classic()+
theme(legend.justification=c(1,0), legend.position=c(1,0),legend.title=element_blank())+
scale_colour_discrete(limits = c("0", "1"),labels = c("Under 21", "21 and above"))
Mortality increases with Age and Age seems to be significant. We dont
get much information from this other than the linear relationship
between the variables.
model2 <- lm(all ~ agecell +over21 +agecell*over21 , data=df)
df$fit_model2 <- predict(model2, df)
huxtable <- huxreg(model2,
statistics = c("N. obs." = "nobs","R squared" = "r.squared"))%>%
set_caption("Table 2: Flexible RDD")
huxtable
(1) | |
---|---|
(Intercept) | 76.251 *** |
(16.396) | |
agecell | 0.827 |
(0.819) | |
over21 | 83.333 ** |
(24.357) | |
agecell:over21 | -3.603 ** |
(1.158) | |
N. obs. | 48 |
R squared | 0.668 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
model_tbl1 = list("Simple RD (1)" = model1,"Flexi RD (2)" = model2)
coefs <- names(coef(model_tbl1[[1]]))[str_detect(names(coef(model_tbl1[[1]])), "vdc")]
huxtable <- huxreg(model_tbl1,number_format = 3, omit_coefs =coefs,
coefs = c("Age"="agecell","Over 21"="over21","Age:Over 21"="agecell:over21"),
statistics = c("Sample size:" = "nobs", "R2" = "r.squared"))%>%
set_caption("Table 3: Compare Simple and Flexi RD")
huxtable
Simple RD (1) | Flexi RD (2) | |
---|---|---|
Age | 1.940 *** | 0.827 |
(0.399) | (0.819) | |
Over 21 | 83.333 ** | |
(24.357) | ||
Age:Over 21 | -3.603 ** | |
(1.158) | ||
Sample size: | 48 | 48 |
R2 | 0.340 | 0.668 |
*** p < 0.001; ** p < 0.01; * p < 0.05. |
Interaction term is significant and the coefficient of Over 21 is high indicating flexible RD is unbiased as it takes into account death over the period (the non-linearity around the threshold of 21 years).
df %>% ggplot(aes(x=agecell, y=all, color=factor(over21))) +
geom_point()+
geom_smooth(method = "lm", alpha = .15)+
labs(x="Age", y="Deaths per 100,000 person-years",title = "Flexible Regression Discontinuity Plot")+
geom_vline(xintercept = 21, color = "red", linetype = "dashed", size=1)+
theme_classic()+
theme(legend.justification=c(1,0), legend.position=c(1,0),legend.title = element_blank())+
scale_colour_discrete(limits = c("0", "1"),labels = c("Under 21", "21 and above"))
model3 <- RDestimate(all ~ agecell, cutpoint = 21, data=df)
summary(model3)
##
## Call:
## RDestimate(formula = all ~ agecell, data = df, cutpoint = 21)
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 1.6561 40 9.001 1.480 6.080 1.199e-09
## Half-BW 0.8281 20 9.579 1.914 5.004 5.609e-07
## Double-BW 3.3123 48 7.953 1.278 6.223 4.882e-10
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 33.08 3 36 3.799e-10
## Half-BW 29.05 3 16 2.078e-06
## Double-BW 32.54 3 44 6.129e-11
plot(model3)
title(main = "Pre-packaged RDD",
xlab = "Age", ylab="Deaths per 100,000 person-years")
abline(v=21,lwd=2,lty="dashed",col="red")
We can see the sharp discontinuity around the threshold of 21 which indicates the higher Average Treatment Effect. #### i. Comparing to pre-packaged RDD with “rectangular” kernel and bw=2
model4 <- RDestimate(all ~ agecell, cutpoint = 21, kernel = "rectangular", bw=2 ,data=df)
summary(model4)
##
## Call:
## RDestimate(formula = all ~ agecell, data = df, cutpoint = 21,
## bw = 2, kernel = "rectangular")
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 2 48 7.663 1.273 6.017 1.776e-09
## Half-BW 1 24 9.753 1.902 5.128 2.929e-07
## Double-BW 4 48 7.663 1.273 6.017 1.776e-09
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 29.47 3 44 2.651e-10
## Half-BW 16.82 3 20 2.167e-05
## Double-BW 29.47 3 44 2.651e-10
plot(model4)
title(main = "Pre-packaged RDD with rectangular kernaland bw=2",
xlab = "Age", ylab="Deaths per 100,000 person-years")
abline(v=21,lwd=2,lty="dashed",col="red")
Sharp discontinuity around the threshold is evident however the standard error has reduced now indicating a better choice of bw. Kernal argument is specifically used to weight the observations based on their distaance from the threshold. Rectangular means no weightage.