library(haven)
rd <- read_dta("C:/Users/ho643/OneDrive - University of Georgia/UGA/2023 Spring/AAEC8610 Adv Quant Meth Econ/HW/HW6/AEJfigs.dta")
#make a dummy of over21
rd$over21 <- ifelse(rd$agecell > 21, 1, 0)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#Reshape the data
rd_long <- rd %>%
select(agecell,all,allfitted,internal,internalfitted,external,externalfitted) %>%
gather(value = "value", key = "vars", -agecell)
## Warning: attributes are not identical across measure variables;
## they will be dropped
f1<-ggplot(na.omit(rd_long), aes(x = agecell, y = value))+
geom_point(aes(color = vars)) +
scale_shape_manual(values = c(18, 17,0), names("")) +
scale_x_continuous(breaks=seq(19, 23, 0.5),limits=c(19,23),expand=c(0,0)) +
labs(title = "Age Profile for Death Rates", x = "Age", y = "Deaths per 100,000 person-years")+
scale_y_continuous(breaks=seq(0,120,20),limits=c(0,120),expand=c(0,0))
f1
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
rd_reg <- lm(all ~ over21 + agecell, data=rd)
stargazer(rd_reg, title="Simple RD regression", type="text")
##
## Simple RD regression
## ===============================================
## Dependent variable:
## ---------------------------
## all
## -----------------------------------------------
## over21 7.663***
## (1.440)
##
## agecell -0.975
## (0.632)
##
## Constant 112.310***
## (12.668)
##
## -----------------------------------------------
## Observations 48
## R2 0.595
## Adjusted R2 0.577
## Residual Std. Error 2.493 (df = 45)
## F Statistic 32.995*** (df = 2; 45)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The dummy variable of over 21 shows significant coefficient. It can be interpreted as death rates for people over 21 are higher than those under 21.This means that death increases by 7.7 individuals per 100,000 persons at age 21. It can be used as LATE to estimate the relationship. The agecell does not shows significant coefficient.
rd$fit <- predict(rd_reg, rd)
rd$over21 <- as.factor(rd$over21)
f2 <- ggplot(na.omit(rd), aes(x=agecell, y=fit, color=over21))+
geom_line() +
geom_point(aes(x=agecell, y=all)) +
labs(title="Regression Discontinuity Plot", x = "Age", y = "Deaths", colour = "Over 21")+
scale_color_manual(values=c("red","green"),labels=c("under 21", "over 21"))
f2
rd_interac <- lm(all ~ over21 + agecell+over21 *agecell, data=rd)
stargazer(rd_interac, "Simple RD regression with interaction term", type= "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## all
## -----------------------------------------------
## over211 83.333***
## (24.357)
##
## agecell 0.827
## (0.819)
##
## over211:agecell -3.603***
## (1.158)
##
## Constant 76.251***
## (16.396)
##
## -----------------------------------------------
## Observations 48
## R2 0.668
## Adjusted R2 0.645
## Residual Std. Error 2.283 (df = 44)
## F Statistic 29.466*** (df = 3; 44)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## ==========================================
## Simple RD regression with interaction term
## ------------------------------------------
The dummy variable of over 21 shows significant coefficient, which is higher than the previous results. It can be interpreted as death rates for people over 21 are higher than those under 21. It can be used as LATE to estimate the relationship. The agecell does not shows significant coefficient. The results are different than the previous one due to adding flexibility in the regression.
rd$fit2 <- predict(rd_interac, rd)
f3 <- ggplot(na.omit(rd), aes(x=agecell, y=fit2, color=over21))+
geom_line() +
geom_point(aes(x=agecell, y=all)) +
labs(title="Regression Discontinuity with Interaction Term Plot", x = "Age", y = "Deaths", colour = "Over 21")+
scale_color_manual(values=c("red","green"),labels=c("under 21", "over 21"))
f3
#install.packages("rdd")
library(rdd)
## Loading required package: sandwich
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: AER
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: survival
## Loading required package: Formula
rd_pac <- RDestimate(all ~ agecell, data = rd, cutpoint = 21)
summary(rd_pac)
##
## Call:
## RDestimate(formula = all ~ agecell, data = rd, cutpoint = 21)
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 1.6561 40 9.001 1.480 6.080 1.199e-09
## Half-BW 0.8281 20 9.579 1.914 5.004 5.609e-07
## Double-BW 3.3123 48 7.953 1.278 6.223 4.882e-10
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 33.08 3 36 3.799e-10
## Half-BW 29.05 3 16 2.078e-06
## Double-BW 32.54 3 44 6.129e-11
When we look at the Late estimate, we can see that the coefficient is larger than the previous results. The coefficient means that death increases by 9 individuals per 100,000 persons at age 21. However, the standard error is smaller. It is because the bandwidth we choose is different than the prepackaged Rdd or prepackaged Rdd use local polynomial regression method.
plot(rd_pac)
rd_linear <- RDestimate(all ~ agecell, data = rd, cutpoint = 21, kernel = "rectangular", bw=2)
summary(rd_linear)
##
## Call:
## RDestimate(formula = all ~ agecell, data = rd, cutpoint = 21,
## bw = 2, kernel = "rectangular")
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 2 48 7.663 1.273 6.017 1.776e-09
## Half-BW 1 24 9.753 1.902 5.128 2.929e-07
## Double-BW 4 48 7.663 1.273 6.017 1.776e-09
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 29.47 3 44 2.651e-10
## Half-BW 16.82 3 20 2.167e-05
## Double-BW 29.47 3 44 2.651e-10
plot(rd_linear)
This results are different than the previous prepackaged RDD results, while it is the same results as the one we obtained in the regression. However, the standard error is a bit smaller. It is because this current prepackaged RDD use linear regression method as well as we chose the bandwidth and kernel.