# Load STATA file using the foreign package
library(foreign)
# Import dta data
my_data <- read.dta("AEJfigs.dta")
summary(my_data)
## agecell all allfitted internal
## Min. :19.07 Min. : 88.43 Min. : 91.71 Min. :15.98
## 1st Qu.:20.08 1st Qu.: 92.79 1st Qu.: 93.04 1st Qu.:18.60
## Median :21.00 Median : 95.69 Median : 95.18 Median :20.29
## Mean :21.00 Mean : 95.67 Mean : 95.80 Mean :20.29
## 3rd Qu.:21.92 3rd Qu.: 98.03 3rd Qu.: 97.79 3rd Qu.:21.98
## Max. :22.93 Max. :105.27 Max. :102.89 Max. :24.37
## NA's :2 NA's :2
## internalfitted external externalfitted alcohol
## Min. :16.74 Min. :71.34 Min. :73.16 Min. :0.6391
## 1st Qu.:18.67 1st Qu.:73.04 1st Qu.:74.06 1st Qu.:0.9962
## Median :20.54 Median :74.81 Median :74.74 Median :1.2119
## Mean :20.28 Mean :75.39 Mean :75.52 Mean :1.2573
## 3rd Qu.:21.66 3rd Qu.:77.24 3rd Qu.:76.06 3rd Qu.:1.4701
## Max. :24.04 Max. :83.33 Max. :81.78 Max. :2.5193
## NA's :2 NA's :2
## alcoholfitted homicide homicidefitted suicide
## Min. :0.7943 Min. :14.95 Min. :16.26 Min. :10.89
## 1st Qu.:1.0724 1st Qu.:16.61 1st Qu.:16.54 1st Qu.:11.61
## Median :1.2471 Median :16.99 Median :16.99 Median :12.20
## Mean :1.2674 Mean :16.91 Mean :16.95 Mean :12.35
## 3rd Qu.:1.4455 3rd Qu.:17.29 3rd Qu.:17.25 3rd Qu.:12.82
## Max. :1.8174 Max. :18.41 Max. :17.76 Max. :14.83
## NA's :2 NA's :2
## suicidefitted mva mvafitted drugs
## Min. :11.59 Min. :26.86 Min. :27.87 Min. :3.202
## 1st Qu.:11.61 1st Qu.:30.12 1st Qu.:30.17 1st Qu.:3.755
## Median :12.25 Median :31.64 Median :31.73 Median :4.314
## Mean :12.36 Mean :31.62 Mean :31.68 Mean :4.250
## 3rd Qu.:13.04 3rd Qu.:33.10 3rd Qu.:33.40 3rd Qu.:4.756
## Max. :13.55 Max. :36.39 Max. :34.82 Max. :5.565
## NA's :2 NA's :2
## drugsfitted externalother externalotherfitted
## Min. :3.449 Min. : 7.973 Min. : 8.388
## 1st Qu.:3.769 1st Qu.: 9.149 1st Qu.: 9.347
## Median :4.323 Median : 9.561 Median : 9.690
## Mean :4.255 Mean : 9.599 Mean : 9.610
## 3rd Qu.:4.679 3rd Qu.:10.122 3rd Qu.: 9.939
## Max. :5.130 Max. :11.483 Max. :10.353
## NA's :2
# Create over21 dummy variable: 1 if over 21, else 0
my_data$over21 = ifelse(my_data$agecell >= 21,1,0)
# fig 3 of paper: plot points first
plot(my_data$agecell, my_data$all, col = "gray", pch = 18,
ylim = c(0, 120), xlab = "Age", ylab = "Deaths per 100,000 person-years")
points(my_data$agecell, my_data$internal, pch = 22, cex = 0.6, col = "darkgray")
points(my_data$agecell, my_data$external, pch = 17, cex = 0.6, col = "black")
# For lines, plot below the cutoff and above the cutoff
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$allfitted, NA),
lty = 3, lwd = 3, col = "darkgray")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$allfitted, NA),
lty = 3, lwd = 3, col = "darkgray")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$internalfitted, NA),
lty = 5, col = "darkgrey")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$internalfitted, NA),
lty = 5, col = "darkgrey")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$externalfitted, NA),
lty = 1, col = "black")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$externalfitted, NA),
lty = 1, col = "black")
legend(21, 60, lty = c(0,0,0,3,3,5,5,1,1), pch = c(18,22,17,NA,NA,NA,NA,NA,NA),
col = c("gray","darkgray","black","darkgray","darkgrey","darkgrey","darkgrey","black","black"),
legend = c("All","Internal","External",
"All fitted", "Internal fitted", "External fitted"),
ncol = 2, cex = 0.6)
library(stargazer)
# RDD
model <- lm(all ~ agecell + over21, data = my_data)
# Create a table with stargazer package
stargazer(model, type = "text", title = "Regression-Discontinuity Design",
align = TRUE, keep.stat = c("n","rsq"),
dep.var.labels = c("Discontinuity in Deaths"), covariate.labels = c("agecell", "over21"))
##
## Regression-Discontinuity Design
## ========================================
## Dependent variable:
## ---------------------------
## Discontinuity in Deaths
## ----------------------------------------
## agecell -0.975
## (0.632)
##
## over21 7.663***
## (1.440)
##
## Constant 112.310***
## (12.668)
##
## ----------------------------------------
## Observations 48
## R2 0.595
## ========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
This model can be expressed as:
\[
AllDeaths_{i} = \alpha + \beta(AgeCell_i) + \gamma(Over21_i) + \epsilon_{i}
\] The table above is the relationship between drinking and mortality. The simple RD regression shows a significant jump in deaths at age 21. The estimated value indicates the effects of alcohol drinking on deaths.
# First estimate the fitted value
my_data$modelFit <- predict(model, my_data)
# Then, plot them based on the cutoff point
plot(my_data$agecell, my_data$all, col = "black", pch = 18, ylim = c(85, 110),
xlab = "Age", ylab = "Deaths per 100,000 person-years", main = "RDD results")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$modelFit, NA),
lty = 1, lwd = 3, col = "red")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$modelFit, NA),
lty = 1, lwd = 3, col = "red")
legend("bottomright", lty=c(0,1,1), pch=c(18,NA,NA), col = c("black", "red", "red"),
legend = c("All", "Fitted"), cex = 0.6)
# Run the model
model1 = lm(all ~ agecell + over21 + agecell*over21, data = my_data)
stargazer(model1, type = "text", title = "Regression-Discontinuity Design", align = TRUE,
keep.stat = c("n","rsq"), dep.var.labels = c("Discontinuity in Deaths"),
covariate.labels = c("agecell", "over21", "agecell * over21"))
##
## Regression-Discontinuity Design
## ============================================
## Dependent variable:
## ---------------------------
## Discontinuity in Deaths
## --------------------------------------------
## agecell 0.827
## (0.819)
##
## over21 83.333***
## (24.357)
##
## agecell * over21 -3.603***
## (1.158)
##
## Constant 76.251***
## (16.396)
##
## --------------------------------------------
## Observations 48
## R2 0.668
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
This model can be expressed as:
\[
AllDeaths_{i} = \alpha + \beta_1(AgeCell_i) + \beta_2(Over21_i) + \beta_3(AgeCell_i)*(Over21_i) + \epsilon_{i}
\]
# Estimate
estimate <- summary(model1)$coefficients[3, 1] +
summary(model1)$coefficients[4,1] * 21
Here, the estimate of interest is \(\beta_2 + \beta_3(AgeCell_i)*(Over21_i)\) ( = 7.663, which is calculated above). The estimate does not differ from the previous one.
# First estimate the fitted value
my_data$modelFit1 <- predict(model1, my_data)
# Then, plot them based on the cutoff point
plot(my_data$agecell, my_data$all, col = "black", pch = 18, ylim = c(85, 110),
xlab = "Age", ylab = "Deaths per 100,000 person-years", main = "RDD results")
lines(my_data$agecell, ifelse(my_data$over21==0, my_data$modelFit1, NA),
lty = 1, lwd = 3, col = "red")
lines(my_data$agecell, ifelse(my_data$over21==1, my_data$modelFit1, NA),
lty = 1, lwd = 3, col = "red")
legend("bottomright", lty=c(0,1,1), pch=c(18,NA,NA), col = c("black", "red", "red"),
legend = c("All", "Fitted"), cex = 0.6)
# Using rdd package
library(rdd)
model2 <- RDestimate(all~agecell, data = my_data, cutpoint = 21)
summary(model2)
##
## Call:
## RDestimate(formula = all ~ agecell, data = my_data, cutpoint = 21)
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 1.6561 40 9.001 1.480 6.080 1.199e-09
## Half-BW 0.8281 20 9.579 1.914 5.004 5.609e-07
## Double-BW 3.3123 48 7.953 1.278 6.223 4.882e-10
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 33.08 3 36 3.799e-10
## Half-BW 29.05 3 16 2.078e-06
## Double-BW 32.54 3 44 6.129e-11
# Plot output of regression
plot(model2)
title(main = "RDD with default kernel and bandwidth")
Here, the local average treatment effect is roughly 9, which is higher than what we obtained from the regression above. This can be owed to the default triangular kernel and bandwidth of 1.6. We can argue that with triangular kernel and default bandwidth we were able to better estimate the treatment effect.
# rerun the model with rectangular kernel and bandwidth 2
model3 <- RDestimate(all~agecell, data = my_data, cutpoint = 21, kernel = "rectangular", bw = 2)
summary(model3)
##
## Call:
## RDestimate(formula = all ~ agecell, data = my_data, cutpoint = 21,
## bw = 2, kernel = "rectangular")
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 2 48 7.663 1.273 6.017 1.776e-09
## Half-BW 1 24 9.753 1.902 5.128 2.929e-07
## Double-BW 4 48 7.663 1.273 6.017 1.776e-09
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 29.47 3 44 2.651e-10
## Half-BW 16.82 3 20 2.167e-05
## Double-BW 29.47 3 44 2.651e-10
# Plot output of regression
plot(model3)
title(main = "RDD with rectangular kernel and higher bandwidth")
Here, the local average treatment effect is around 7.66, which is similar to the one we obtained from the regression above. We used rectangular kernel and higher bandwidth of 2 and thus can argue that with these options we have smaller standard error compared to that in part h above but the estimation does not improve.