Loading necessay packages

#Loading necessay packages 
library(haven)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(rdd)
## Loading required package: sandwich
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: AER
## Loading required package: car
## Loading required package: carData
## Loading required package: survival
## Loading required package: Formula
library(rddtools)
## Loading required package: np
## Nonparametric Kernel Methods for Mixed Datatypes (version 0.60-11)
## [vignette("np_faq",package="np") provides answers to frequently asked questions]
## [vignette("np",package="np") an overview]
## [vignette("entropy_np",package="np") an overview of entropy-based methods]
## 
## Please consider citing R and rddtools,
## citation()
## citation("rddtools")

HW06 - Regression Discontinuity

Reference: Carpenter and Dobkin(2009) The Effect of Alcohol Consumption on Mortality: Regression Discontinuity Evidence from the Minimum Drinking Age AEJ: Applied Economics 1(1)164-82.

Get the Carpenter & Dobkin and data:

#loading the dataset

RDDdata <- read_dta("AEJfigs.dta")

dim(RDDdata)
## [1] 50 19
head(RDDdata, 3)
## # A tibble: 3 x 19
##   agecell   all allfitted internal internalfitted external externalfitted
##     <dbl> <dbl>     <dbl>    <dbl>          <dbl>    <dbl>          <dbl>
## 1    19.1  92.8      91.7     16.6           16.7     76.2           75.0
## 2    19.2  95.1      91.9     18.3           16.9     76.8           75.0
## 3    19.2  92.1      92.0     18.9           17.1     73.2           75.0
## # ... with 12 more variables: alcohol <dbl>, alcoholfitted <dbl>,
## #   homicide <dbl>, homicidefitted <dbl>, suicide <dbl>, suicidefitted <dbl>,
## #   mva <dbl>, mvafitted <dbl>, drugs <dbl>, drugsfitted <dbl>,
## #   externalother <dbl>, externalotherfitted <dbl>

Make an over21 dummy to help with the following analysis.

#creating dummy variable over21

RDDdata$over21 <- ifelse(RDDdata$agecell >= 21.0, c(1), c(0))

c. Reproduce Figure 3 of the paper

No need to bother trying to get their exact formatting here (unless you love doing that). Just make sure you show the data and the discontinuities.

plot(RDDdata$agecell, RDDdata$all, pch=18,  col="black", cex=0.7,  xlab="Age", ylab="Deaths per 100,000 person-years ", main="Figure 3: Age Profile for Death Rate",ylim=c(0,120), xlim=c(19, 23) )
points(RDDdata$agecell, RDDdata$internal, pch=0, cex=0.6, col="skyblue4")
points(RDDdata$agecell, RDDdata$external, pch=17, cex=0.6, col="midnightblue")
lines(RDDdata$agecell, ifelse(RDDdata$over21==0, RDDdata$allfitted, NA), lty=3, lwd=1.5, col="black")
lines(RDDdata$agecell, ifelse(RDDdata$over21==1, RDDdata$allfitted, NA), lty=3, lwd=1.5, col="black")
lines(RDDdata$agecell, ifelse(RDDdata$over21==0, RDDdata$internalfitted, NA), lty=5, lwd=1, col="skyblue4")
lines(RDDdata$agecell, ifelse(RDDdata$over21==1, RDDdata$internalfitted, NA), lty=5, lwd=1, col="skyblue4")
lines(RDDdata$agecell, ifelse(RDDdata$over21==0, RDDdata$externalfitted, NA), lty=1, lwd=2, col="midnightblue")
lines(RDDdata$agecell, ifelse(RDDdata$over21==1, RDDdata$externalfitted, NA), lty=1, lwd=2, col="midnightblue")
legend(20,65, lty=c(0,0,0,3,3,5,5,1,1), pch=c(18, 0, 17, NA, NA, NA,  NA, NA, NA),  
       legend=c("All","Internal","External","All fitted", "Internal fitted", "External fitted"),
       col=c("black", "grey54", "midnightblue", "black", "grey54", "midnightblue"),  text.font=4,  pt.cex = 1, cex=0.8, ncol=2)

d. Simplest regression-discontinuity design

Run a simple RD regression (same slope, with a jump) for “all” deaths by age (I don’t mean all the variables, just the one variable that is called “all”)

#Same slope 

model_rdd_SS <- lm(all~over21+I(agecell-21), data=RDDdata)

summary(model_rdd_SS)
## 
## Call:
## lm(formula = all ~ over21 + I(agecell - 21), data = RDDdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0559 -1.8483  0.1149  1.4909  5.8043 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      91.8414     0.8050 114.083  < 2e-16 ***
## over21            7.6627     1.4403   5.320 3.15e-06 ***
## I(agecell - 21)  -0.9747     0.6325  -1.541     0.13    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.493 on 45 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.5946, Adjusted R-squared:  0.5765 
## F-statistic: 32.99 on 2 and 45 DF,  p-value: 1.508e-09

Analyse the results. How do you use these results to estimate the relationship between drinking and mortality?

stargazer(model_rdd_SS, type="text", title="Table1: Estimating the relationship between drinking and mortality ", align=TRUE, dep.var.labels = "Mortality rate", covariate.labels=c("Deaths due to all causes
Over 21"))
## 
## Table1: Estimating the relationship between drinking and mortality
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                           Mortality rate       
## -----------------------------------------------
## Over 21                      7.663***          
##                               (1.440)          
##                                                
## I(agecell - 21)               -0.975           
##                               (0.632)          
##                                                
## Constant                     91.841***         
##                               (0.805)          
##                                                
## -----------------------------------------------
## Observations                    48             
## R2                             0.595           
## Adjusted R2                    0.577           
## Residual Std. Error       2.493 (df = 45)      
## F Statistic           32.995*** (df = 2; 45)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

In the above analysis, the average treatment effect is represented by the coefficient of the dummy variable over21. The mortality rate or deaths per 100,000 for individuals with age over 21 (minimum drinking age) is 7.663 points higher than individuals with age below 21 years.

e. Plot the results

Plot the all variable by age, and add the regression lines defined by your regression output

plot(RDDdata$agecell, RDDdata$all, xlab="Age", ylab="Mortality rate (per 100,000)", main="Regression-Discontinuity Design-Same Slope")
abline(lm(all~agecell, data=RDDdata), col="red")

f. Allow more flexibility to your RD

Run another RD where not only the intercept, but also the slope can differ between the under- and over-21 sub samples.

#Different Slope 

model_rdd_DS <- lm(all~over21+I(agecell-21)+over21*I(agecell-21), data=RDDdata) 

summary(model_rdd_DS)
## 
## Call:
## lm(formula = all ~ over21 + I(agecell - 21) + over21 * I(agecell - 
##     21), data = RDDdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.368 -1.787  0.117  1.108  5.341 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             93.6184     0.9325 100.399  < 2e-16 ***
## over21                   7.6627     1.3187   5.811  6.4e-07 ***
## I(agecell - 21)          0.8270     0.8189   1.010  0.31809    
## over21:I(agecell - 21)  -3.6034     1.1581  -3.111  0.00327 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.283 on 44 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.6677, Adjusted R-squared:  0.645 
## F-statistic: 29.47 on 3 and 44 DF,  p-value: 1.325e-10

Analyse the results. What is your estimate of interest. Does it differ from the previous one?

stargazer(model_rdd_DS, type="text", title="Table2: Estimating the relationship between drinking and mortality with different slopes ", align=TRUE, dep.var.labels = "Mortality rate", covariate.labels=c("Deaths due to all causes Over 21", "I(agecell-21)", "over21*I(agecell-21)"))
## 
## Table2: Estimating the relationship between drinking and mortality with different slopes
## ============================================================
##                                      Dependent variable:    
##                                  ---------------------------
##                                        Mortality rate       
## ------------------------------------------------------------
## Deaths due to all causes Over 21          7.663***          
##                                            (1.319)          
##                                                             
## I(agecell-21)                               0.827           
##                                            (0.819)          
##                                                             
## over21*I(agecell-21)                      -3.603***         
##                                            (1.158)          
##                                                             
## Constant                                  93.618***         
##                                            (0.932)          
##                                                             
## ------------------------------------------------------------
## Observations                                 48             
## R2                                          0.668           
## Adjusted R2                                 0.645           
## Residual Std. Error                    2.283 (df = 44)      
## F Statistic                        29.466*** (df = 3; 44)   
## ============================================================
## Note:                            *p<0.1; **p<0.05; ***p<0.01

Running the more flexible RD model with different slope does not alter the result of average treatment effect. The estimate of interest is the coefficient of over21 dummy variable (average treatment effect). Deaths per 100,000 person is 7.663 points higher for individuals over age 21 years.

g. Again, plot the data

Again, plot the data and the regression lines defined by the regression.

plot(RDDdata$agecell, RDDdata$all, xlab="Age", ylab="Mortality rate (per 100,000)", main="Regression-Discontinuity Design-Different Slope")
abline(lm(all~agecell+over21*I(agecell-21), data=RDDdata), col="red")
## Warning in abline(lm(all ~ agecell + over21 * I(agecell - 21), data =
## RDDdata), : only using the first two of 5 regression coefficients

f. Compare to pre-packaged RDD:

Use the rdd package. Run the RDestimate command on all~agecell, specifying your data and the cutoff option at 21. Print the output of your regression.

newResult <- RDestimate(all~agecell, data=RDDdata, subset = NULL, cutpoint = 21, bw = NULL)

newResult
## 
## Call:
## RDestimate(formula = all ~ agecell, data = RDDdata, subset = NULL, 
##     cutpoint = 21, bw = NULL)
## 
## Coefficients:
##      LATE    Half-BW  Double-BW  
##     9.001      9.579      7.953

Plot the output of your regression

plot(newResult)

Discuss the results. Do they differ from what you found? Why?

The coefficients of local average treatments effects (LATE) is 9.001 which is higher than the average treatment effect from simple regression analysis (7.663). The coefficient of LATE is more centered on the cutoff age of 21 years (bw=NULL).

g. Prepackaged linear RDD:

Re-run an RDestimate command, this time adding the options kernel = “rectangular” and bw=2. Output the results. Plot them. Compare to previous output and discuss.

newResult1 <- RDestimate(all~agecell, data=RDDdata, subset = NULL, cutpoint = 21, bw = 2, kernel = "rectangular")

newResult1
## 
## Call:
## RDestimate(formula = all ~ agecell, data = RDDdata, subset = NULL, 
##     cutpoint = 21, bw = 2, kernel = "rectangular")
## 
## Coefficients:
##      LATE    Half-BW  Double-BW  
##     7.663      9.753      7.663
plot(newResult1)

Changing the bin width to 2 (the spread of age around cutoff age) matches the coefficients of LATE to average treatment effect from simple regression of discontinuity (7.663)