I choose Difference-in-differences method and RDD.
library(dplyr)
library(ggplot2)
set.seed(123)
T = 100 # no of periods
N = 20 # no of subjects
#simulate time trend for data
ar1 <-c(0.2, 0.5)
ma1 <-c(0.3, 0.5)
ar <- as.numeric(arima.sim(n=T, list(ar = ar1 , ma = ma1))) + 0.6 * (1:T)
#simulate other parameters
group <- rep(c("control","treat"),each = N*T/2)
D <- rep(c(0,1),each = N*T/2)
exp <- rep(c(rep(0, T/2), rep(1, T/2)), length.out = N*T)
t<- rep(c(1:T),times = N)
mu <- rep(ar[1:T],times = N)
eps <- rnorm(T)
sigma = 2; tau = 22; gamma = 9; beta = 2
eps = rnorm(N*T)
Y = mu + sigma * D + gamma * D *exp + eps
dat1 <- data.frame( t, group , D, exp, mu, eps, Y)
head(dat1, 5)
## t group D exp mu eps Y
## 1 1 control 0 0 2.543891 -0.5743887 1.969502
## 2 2 control 0 0 3.455179 -1.3170161 2.138163
## 3 3 control 0 0 4.394281 -0.1829254 4.211356
## 4 4 control 0 0 4.495029 0.4189824 4.914012
## 5 5 control 0 0 4.668569 0.3243043 4.992874
gdat = dat1 %>%
group_by(group,t,exp,D) %>%
summarize(Y = mean(Y))
ggplot(gdat) +
geom_line(aes(x = t, y = Y, color = group))+
geom_vline(xintercept=T/2) +
labs(x = "Period", y = "Outcome", color = "Treatment")
#regression
library(stargazer)
g <- lm(Y ~ D *exp, data=dat1 )
stargazer(g, title="regression results", type = "text")
##
## regression results
## ===============================================
## Dependent variable:
## ---------------------------
## Y
## -----------------------------------------------
## D 1.965***
## (0.510)
##
## exp 29.637***
## (0.510)
##
## D:exp 9.075***
## (0.721)
##
## Constant 15.657***
## (0.361)
##
## -----------------------------------------------
## Observations 2,000
## R2 0.826
## Adjusted R2 0.826
## Residual Std. Error 8.062 (df = 1996)
## F Statistic 3,155.705*** (df = 3; 1996)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
From the plot, we can see that post-trends for treated is higher than control; and it show parallel trends assumption during pre-trends period.
The regression result show that the casual effect of treatment is 9.075, which is very similar to what we simulate, 9. The reason behind it could be that I add time trend(AR(1)) and disturbance term eps.
FTE = dat1 %>%
group_by(exp,D) %>%
summarize(Y = mean(Y))
FTE
## # A tibble: 4 × 3
## # Groups: exp [2]
## exp D Y
## <dbl> <dbl> <dbl>
## 1 0 0 15.7
## 2 0 1 17.6
## 3 1 0 45.3
## 4 1 1 56.3
ATT <- as.numeric ((FTE[4,3] - FTE[3,3]) -(FTE[2,3] - FTE[1,3]))
ATT
## [1] 9.075009
Similarly as in hw5, I calculate the ATT by hand and it has the same results as I did in regression.
# Simulate data for RD design
n <- 1000
cutoff <- 50
sd <- 10
x <- rnorm(n, mean = 50, sd = sd)
t <- as.numeric(x >= cutoff)
eps <- rnorm(n)
y <- 10 + 10*t + 0.5*x + eps
# Create dataframe
data <- data.frame(x = x, y = y, t = t)
# Plot data
ggplot()+
geom_point(aes(x = x, y = y),data = data)+
geom_smooth(aes(x=x,y=y),subset(data,x <cutoff)) +
geom_smooth(aes(x=x,y=y),subset(data,x >=cutoff))
# Fit RD model
g1 <- lm(y ~ x + t, data = data)
g2 <- lm(y ~ x + t + I(x*t), data = data)
stargazer(g1,g2, title= "RD regression results", type = "text")
##
## RD regression results
## ===========================================================================
## Dependent variable:
## -------------------------------------------------------
## y
## (1) (2)
## ---------------------------------------------------------------------------
## x 0.508*** 0.512***
## (0.005) (0.007)
##
## t 9.919*** 10.324***
## (0.103) (0.526)
##
## I(x * t) -0.008
## (0.010)
##
## Constant 9.634*** 9.463***
## (0.221) (0.310)
##
## ---------------------------------------------------------------------------
## Observations 1,000 1,000
## R2 0.989 0.989
## Adjusted R2 0.989 0.989
## Residual Std. Error 0.983 (df = 997) 0.984 (df = 996)
## F Statistic 46,624.340*** (df = 2; 997) 31,071.210*** (df = 3; 996)
## ===========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The plots show that the data have discontinuity point at x=50. I run two regression with or without interactive term(t*x), they are all very close to my settings and the one with interactive term may be more accurate.
library(rdd)
g3 <- RDestimate(y ~ x, data=data, cutpoint=50)
summary(g3)
##
## Call:
## RDestimate(formula = y ~ x, data = data, cutpoint = 50)
##
## Type:
## sharp
##
## Estimates:
## Bandwidth Observations Estimate Std. Error z value Pr(>|z|)
## LATE 4.209 321 9.769 0.2512 38.89 0.000e+00
## Half-BW 2.104 168 9.927 0.3629 27.36 9.044e-165
## Double-BW 8.418 617 9.743 0.1717 56.73 0.000e+00
##
## LATE ***
## Half-BW ***
## Double-BW ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## F-statistics:
## F Num. DoF Denom. DoF p
## LATE 3499 3 317 0
## Half-BW 1525 3 164 0
## Double-BW 8468 3 613 0
plot(g3)