For instance, from the AEA website.
#1.1.a. Download and read the paper:
#1.1.b. Get the Carpenter & Dobkin and data:
Can be downloaded straight here http://masteringmetrics.com/wp-content/uploads/2015/01/AEJfigs.dta, from A&P Mastering Metrics resources webpage Look over the data, understand what the different variables are. The key variables you need are agecell, all, internal, external, as well as the versions of those variables that have the suffix fitted. Make an over21 dummy to help with the following analysis.
# Loading packages
knitr::opts_chunk$set(echo = TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.height=4)
necessaryPackages <- c("foreign","reshape","rvest","tidyverse","dplyr","stringr","ggplot2", "stargazer","readr", "haven")
new.packages <- necessaryPackages[
!(necessaryPackages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(necessaryPackages, require, character.only = TRUE)
## Loading required package: foreign
## Loading required package: reshape
## Loading required package: rvest
## Loading required package: xml2
## Loading required package: tidyverse
## ── Attaching packages ────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.0 ✓ dplyr 1.0.2
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
## x dplyr::rename() masks reshape::rename()
## Loading required package: stargazer
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
## Loading required package: haven
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] TRUE
##
## [[5]]
## [1] TRUE
##
## [[6]]
## [1] TRUE
##
## [[7]]
## [1] TRUE
##
## [[8]]
## [1] TRUE
##
## [[9]]
## [1] TRUE
##
## [[10]]
## [1] TRUE
#Load dataset
df_alcohol <- read_dta("/Users/twinkleroy/Downloads/AEJfigs.dta")
#50 observations with 19 Variables
head(df_alcohol, 6) #the first six rows of the data
## # A tibble: 6 x 19
## agecell all allfitted internal internalfitted external externalfitted
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 19.1 92.8 91.7 16.6 16.7 76.2 75.0
## 2 19.2 95.1 91.9 18.3 16.9 76.8 75.0
## 3 19.2 92.1 92.0 18.9 17.1 73.2 75.0
## 4 19.3 88.4 92.2 16.1 17.3 72.3 74.9
## 5 19.4 88.7 92.3 17.4 17.4 71.3 74.9
## 6 19.5 90.2 92.5 17.9 17.6 72.3 74.9
## # … with 12 more variables: alcohol <dbl>, alcoholfitted <dbl>, homicide <dbl>,
## # homicidefitted <dbl>, suicide <dbl>, suicidefitted <dbl>, mva <dbl>,
## # mvafitted <dbl>, drugs <dbl>, drugsfitted <dbl>, externalother <dbl>,
## # externalotherfitted <dbl>
#1.1.c. Reproduce Figure 3 of the paper
No need to bother trying to get their exact formatting here (we are past that stage of the course). Just make sure you show the data and the discontinuities.
library(tidyr)
library(dplyr)
library(ggplot2)
#Reshape the data for the plot
df1 <- df_alcohol %>%
select(agecell, all, allfitted, internal, internalfitted, external, externalfitted) %>%
gather(key = "variable", value = "value", -agecell)
#head(df1, 3) #View the first three rows of the new data frame
# Multiple plotS using points for the fitted lines
Figure3 <- ggplot(df1, aes(x = agecell, y = value)) +
geom_point(aes(color = variable)) +
scale_x_continuous(breaks=seq(19, 23, 0.5),limits=c(19,23),expand=c(0,0)) +
scale_color_manual(values=c("#CCCCCC","#666666","#999999", "#000000", "#99CCFF", "#0000FF"),
name="variable",
labels=c("All", "All fitted", "Internal", "Internal fitted", "External", "External fitted"))+
labs(title= "FIGURE 3. AGE PROFILE FOR DEATH RATES", x = "Age",
y = "Deaths per 100,000 person-years") + theme(panel.background = element_blank()) +
theme(plot.title = element_text(hjust = 1)) +scale_y_continuous(breaks=seq(0,120,20),limits=c(0,120),expand=c(0,0)) +
theme(axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5), axis.text.y = element_text(colour = "black"), panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "white",size=0.75)) + theme(panel.border= element_blank()) +
theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black"), legend.direction = "vertical", legend.title = element_blank()) + guides(col =guide_legend(nrow =3))
Figure3
#The multiplot for the RDD using lines for the fitted lines
df <- df_alcohol %>%
select(agecell, all, internal, external) %>%
gather(key = "variable", value = "value", -agecell)
df_fitted <-df_alcohol %>%
select(allfitted, internalfitted, externalfitted) %>%
gather(key = "fittedvariable", value = "fittedvalue")
df_together <- cbind(df, df_fitted)
Figure3a <- ggplot(df_together, aes(x = agecell, y = value)) +
geom_point(aes(color = variable)) +
geom_line(data = df_together, aes(y = fittedvalue, color =fittedvariable), size = 1) +
scale_color_manual(values=c("#CCCCCC","#666666","#999999", "#000000", "#99CCFF", "#0000FF"),
name="variable",
labels=c("All", "All fitted", "Internal", "Internal fitted", "External", "External fitted"))+
labs(title= "FIGURE 3. AGE PROFILE FOR DEATH RATES", x = "Age",
y = "Deaths per 100,000 person-years") + theme(panel.background = element_blank()) +
theme(plot.title = element_text(hjust = 1)) +
scale_y_continuous(breaks=seq(0,120,20),limits=c(0,120),expand=c(0,0)) +
theme(axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5),
axis.text.y = element_text(colour = "black"), panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "white",size=0.75)) +
theme(panel.border= element_blank()) +
theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black"),
legend.direction = "vertical", legend.title = element_blank()) +
guides(col =guide_legend(nrow =3))
Figure3a
#1.1.d. Simplest regression-discontinuity design
Run a simple RD regression (same slope, with a jump) for “all” deaths by age (I don’t mean all the variables, just the one variable that is called “all”). Note: You are NOT trying to reproduce the paper’s tables. That is because you only have access to a collapsed dataset with <50 observations. Don’t expect to closely reproduce the results of the tables in the paper, which uses a full 1500 observations dataset. Also, you don’t have the same variables either
Analyse the results. How do you use these results to estimate the relationship between drinking and mortality?
library(tidyverse)
library(stargazer)
#create the dummy variable over21
df_alcohol <- df_alcohol %>%
mutate(over21 = as.factor(ifelse(agecell >= 21, 1, 0)))
# model with the same slope with a jump
model <- lm(all ~ over21 + I(agecell - 21), data=df_alcohol)
#This regression above can also be run using the rddtools package library(rddtools)
#simple RD model without applying same slope with a jump
model1 <- lm(all~agecell+over21, data=df_alcohol)
stargazer(model1, model, type= "text", no.space =TRUE, keep.stat = c("n", "adj.rsq", "rsq"), title="simple RD regression of drinking on mortality", column.labels = c("no jump applied", "jump applied"))
##
## simple RD regression of drinking on mortality
## ============================================
## Dependent variable:
## ----------------------------
## all
## no jump applied jump applied
## (1) (2)
## --------------------------------------------
## agecell -0.975
## (0.632)
## over211 7.663*** 7.663***
## (1.440) (1.440)
## I(agecell - 21) -0.975
## (0.632)
## Constant 112.310*** 91.841***
## (12.668) (0.805)
## --------------------------------------------
## Observations 48 48
## R2 0.595 0.595
## Adjusted R2 0.577 0.577
## ============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Under the usual assumptions for RDD estimations, _1 can be expressed in a small -neighborhood around the cutoff age of 21 as follows:
\[ \alpha_{1}=\mathop{\mathbb{E}}[all|21 \le agecell \le 21+\delta, over21=1]-\mathop{\mathbb{E}}[all|21-\delta \le agecell \le 21, over21=0]. \]
Coefficient of interest in this regression is the one for over21 and it can be interpreted as the change in death at age 21 for all causes of death. To be specific for this estimate result above, the coefficient of over21 is 7.663, this means that for all death (both internal and external) related to alcohol consumption increases by about 8 deaths per 100,000 person years at age 21. This similar to the result in the paper, although with varying number. Also, in the paper they used log of death while in the regression above.
The results implies that alcohol consumption has a positive impact on mortality i.e. alcohol consumption causes increase in mortality at age 21 by about 8 death per 100,000 person. The coefficient is positive and significant at 1% significant level.
#1.1.e. Plot the results
Plot the all variable by age, and add the regression lines defined by your regression output (It’s ok if the lines extend throughout the whole figure. Don’t worry too much about formatting.)
#get the predicted value
df_alcohol$fittedvalue <- predict(model, df_alcohol)
# plotting fit
fig1 <- plot(df_alcohol$fittedvalue~df_alcohol$agecell,type="l",ylim=range(85,110),
col="red",lwd=2, xlab="Age",
ylab= "Death per 100,000 person-years",
main = "Death rates from all causes against age")
# adding the data points
points(df_alcohol$all~df_alcohol$agecell)
#Using basic plot including the regression line allowing for the discontinuity
fig1.1 <- plot(df_alcohol$fittedvalue~df_alcohol$agecell, ylim = range(85,110),
col ="red", lwd=2, xlab="Age",
ylab = "Death per 100,000 person-years",
main = "Death rates from all causes against age")
points(all~agecell, data=df_alcohol)
#use ggplot2 for the plot of the data and the regression line
fig1.2 <- ggplot(df_alcohol, aes(x=agecell, y=fittedvalue, colour=over21))+ geom_line() +
geom_point(aes(x=df_alcohol$agecell, y=df_alcohol$all)) +
scale_x_continuous(breaks=seq(19, 23, 1),limits=c(19,23),expand=c(0,0)) +
scale_y_continuous(breaks=seq(80,110,5),limits=c(80,110),expand=c(0,0)) +
scale_color_manual(values=c("#3300FF","#336633"),
name="variable",
labels=c("Below 21", "Above 21")) +
labs(title= "Death rates from all causes against age", x = "Age",
y = "Deaths per 100,000 person-years") + theme(panel.background = element_blank()) + theme(plot.title = element_text(hjust = 1)) +theme(axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5), axis.text.y = element_text(colour = "black"), panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "white",size=0.75)) + theme(panel.border= element_blank()) + theme(
plot.title = element_text(hjust = 0.5, size = 10, face="bold")) +
theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black"), legend.direction = "vertical", legend.title = element_blank()) + guides(col =guide_legend(nrow =3))
fig1.2
#######################This is the plot of the data agecell and all using loess smoother for age below and above 21
#use ggplot2 for the plot
#plot for age below and above 21
library(ggplot2)
fig1.3 <- ggplot(df_alcohol, aes(x = agecell, y = all, colour=over21)) +
geom_point() + stat_smooth(method = loess) +
scale_x_continuous(breaks=seq(19, 23, 0.5),limits=c(19,23),expand=c(0,0)) +
scale_y_continuous(breaks=seq(80,110,5),limits=c(80,110),expand=c(0,0)) +
scale_color_manual(values=c("#d8b365","#5ab4ac"),
name="variable",
labels=c("Below 21", "Above 21")) +
labs(title= "Death rates from all causes against age", x = "Age",
y = "Deaths per 100,000 person-years") + theme(panel.background = element_blank()) + theme(plot.title = element_text(hjust = 1)) +theme(axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5), axis.text.y = element_text(colour = "black"), panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "white",size=0.75)) + theme(panel.border= element_blank()) + theme(
plot.title = element_text(hjust = 0.5, size = 10, face="bold"))+
theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black"), legend.direction = "vertical", legend.title = element_blank()) + guides(col =guide_legend(nrow =3))
fig1.3
#1.1.f. Allow more flexibility to your RD
Run other RDs allowing for different intercepts and different slopes between the under- and over-21 subsamples. Several options are possible to include non-linearity in the running variable for allowing more flexibility than the linear RD. Abstracting from the choice of the correct order of polynomial, I present a higher-order polynomial specification (Equation 2) and a lower-order polynomial specification (Equation 3). In both non-linear RD models, “agecell” is centered to allow an easy interpretation of the main effect of “over21” as the treatment effect.
\[\begin{equation} \tag{2} all = \beta_{0} + \beta_{1}over21 + \beta_{2}(agecell-21) + \beta_{3}(agecell-21)^{2} + \beta_{4}over21*(agecell-21) + \beta_{4}over21*(agecell-21)^{2} + u \end{equation}\]
\[\begin{equation} \tag{3} all = \gamma_{0} + \gamma_{1}over21 + \gamma_{2}(agecell-21) + \gamma_{3}|agecell-21|^{1/2} + \beta_{4}over21*(agecell-21) + \beta_{5}over21*|agecell-21|^{1/2} + v \end{equation}\]
#to allow for flexibility I include a quadratic age term in the regression
#model with varying slope
#all = b0 + b1T + b2(agecell - 21) + b3(agecell -21)T + u
#where T = 1 if agecell >= 21
# T = 0 if agecell < 21
#Varying slope RD regression
model2 <- lm(all ~ over21 * I(agecell -21), data = df_alcohol)
stargazer(model2, type= "text", no.space =TRUE, keep.stat = c("n", "adj.rsq", "rsq"), title="simple RD regression of drinking on mortality")
##
## simple RD regression of drinking on mortality
## ===================================================
## Dependent variable:
## ---------------------------
## all
## ---------------------------------------------------
## over211 7.663***
## (1.319)
## I(agecell - 21) 0.827
## (0.819)
## over211:I(agecell - 21) -3.603***
## (1.158)
## Constant 93.618***
## (0.932)
## ---------------------------------------------------
## Observations 48
## R2 0.668
## Adjusted R2 0.645
## ===================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Using non-linear specifications, the impact estimate is still significant at the 1% level of confidence but with a slighlty higher magnitude than that from the linear specification.
Given that _1^=9.548 and _1^=13.127, the estimated impact of an MLDA at age 21 is an increase in overall mortality at age 21 of about 9.548 deaths per 100,000 person years (using Equation 2) and 13.127 deaths per 100,000 person years (using Equation 3).
The standard error for the regression in part d is 1.440 while the standard error for the regression above is 1.319. Since the standard error of the regression with varying slope is lower, then this regression preffered compared to the one in part D. The interpretation of the coefficient of estimates for over21 follows the same as part D that is alcohol consumption causes increase in mortality at age 21 by about 8 death per 100,000 person years.
#1.1.g. Again, plot the data Again, plot the data and the regression lines defined by the regression.
df_alcohol$fittedvalue2 <- predict(model2, df_alcohol)
# plotting fit
fig2.1 <- plot(df_alcohol$fittedvalue2~df_alcohol$agecell,type="l",ylim=range(85,110),
col="blue",lwd=2, xlab="Age",
ylab= "Death per 100,000 person-years",
main = "Death rates from all causes against age")
# adding the data points
points(df_alcohol$all~df_alcohol$agecell)
#Using basic plot including the regression line allowing the discontinuity
fig2.2 <- plot(df_alcohol$fittedvalue2~df_alcohol$agecell, ylim = range(85,110),
col ="blue", lwd=2, xlab="Age",
ylab = "Death per 100,000 person-years",
main = "Death rates from all causes against age")
points(all~agecell, data=df_alcohol)
#use ggplot2 for the plot of the data and the regression line
fig2.3 <- ggplot(df_alcohol, aes(x=agecell, y=fittedvalue2, colour=over21))+ geom_line() +
geom_point(aes(x=df_alcohol$agecell, y=df_alcohol$all)) +
scale_x_continuous(breaks=seq(19, 23, 0.5),limits=c(19,23),expand=c(0,0)) +
scale_y_continuous(breaks=seq(80,110,5),limits=c(80,110),expand=c(0,0)) +
scale_color_manual(values=c("#3300FF","#336633"),
name="variable",
labels=c("Below 21", "Above 21")) +
labs(title= "Death rates from all causes against age", x = "Age",
y = "Deaths per 100,000 person-years") +theme(panel.background = element_blank())+ theme(plot.title = element_text(hjust = 1)) +theme(axis.line.x = element_line(color="black", size = 0.5),
axis.line.y = element_line(color="black", size = 0.5), axis.text.y = element_text(colour = "black"), panel.grid.minor = element_blank(),
panel.grid.major = element_line(colour = "white",size=0.75)) + theme(panel.border= element_blank()) + theme(
plot.title = element_text(hjust = 0.5, size = 10, face="bold")) +
theme(legend.background = element_rect(size=0.5, linetype="solid", colour ="black"), legend.direction = "vertical", legend.title = element_blank()) + guides(col =guide_legend(nrow =3))
fig2.3
#1.1.h. Compare to pre-packaged RDD: From now on, I use the rdd package. I run the RDestimate command on all~agecell, specifying the data and the cutoff age of 21. Use the rdd package. Run the RDestimate command on all~agecell, specifying your data and the cutoff option at 21. Print the output of your regression. Plot the output of your regression (You can just write plot(name). RDestimate outputs plotable objects). Discuss the results. Do they differ from what you found? Why?
library(rdd)
model_rd <- RDestimate(all ~ agecell + over21, data = df_alcohol, cutpoint = 21)
summary(model_rd)
##
## Call:
## RDestimate(formula = all ~ agecell + over21, data = df_alcohol,
## cutpoint = 21)
##
## Type:
## fuzzy
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 1.6561 40 9.001 1.480 6.080 1.199e-09
## Half-BW 0.8281 20 9.579 1.914 5.004 5.609e-07
## Double-BW 3.3123 48 7.953 1.278 6.223 4.882e-10
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 33.08 3 36 1.900e-10
## Half-BW 29.05 3 16 1.039e-06
## Double-BW 32.54 3 44 3.064e-11
plot(model_rd)
The estimation output shows that the LATE of an MLDA at age 21 is an increase in the overall mortality at age 21 by 9 deaths per 100,000 person years, which is significant at the 1% level of significance.
The LATE is computed within a bandwidth of 1.6561 that contains 40 points “close” to the cutoff. Halving the bandwidth restricts the number of neighboring points to the cutoff that are compared, and yields an impact estimate of 9.579 deaths per 100,000 person years. Doubling the bandwidth enlarges the number of neighboring points to the cutoff that are compared, and returns an impact estimate of 7.953 deaths per 100,000 person years.
The linear RDD impact estimate (Task d.) aligns with the results using the double bandwidth, while the higher-order polynomial RDD impact estimate (Task f.) aligns with the results using half of the bandwidth.
The plot shows that using the RDestimate command, without any additional details, fits a non-linear effect of age on mortality rate.
The plot of the 95% confidence interval of the fitted line below and above the cutoff indicates the bounds that can be placed on the estimated LATE. It is easier to observe that the lower-order polynomial RDD impact estimate of 13.127 deaths per 100,000 person years as well as its higher-order polynomial counterpart do lie in the 95% confidence interval of the estimated LATE.
#1.1.i. Prepackaged linear RDD:
RRe-run an RDestimate command, this time adding the options kernel = “rectangular” and bw=2. Output the results. Plot them. Compare to previous output and discuss.
model_rd1 <- RDestimate(all ~ agecell + over21, data = df_alcohol, cutpoint = 21, kernel = "rectangular", bw=2)
summary(model_rd1)
##
## Call:
## RDestimate(formula = all ~ agecell + over21, data = df_alcohol,
## cutpoint = 21, bw = 2, kernel = "rectangular")
##
## Type:
## fuzzy
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 2 48 7.663 1.273 6.017 1.776e-09
## Half-BW 1 24 9.753 1.902 5.128 2.929e-07
## Double-BW 4 48 7.663 1.273 6.017 1.776e-09
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 29.47 3 44 1.325e-10
## Half-BW 16.82 3 20 1.083e-05
## Double-BW 29.47 3 44 1.325e-10
plot(model_rd1)
Specifying a linear RDD with a bandwidth of 2 yields the linear-based impact estimate previously found (Task d.) with similar standard errors. Since this bandwidth already contains all points of sample, doubling the bandwidth changes nothing in the results. Halving the bandwitdth to 1 gives similar results to the higher-order polynomial RDD impact estimate (Task f.) and to the pre-packaged non-linear RDD results with a bandwidth of 0.8281 (Task h.).
Hence, in certain narrow bandwidth, linear and non-linear sharp RDDs give asymptotically similar impact estimates.