I choose Difference-in-differences method and RDD.

DID method

1. Create fake data to simulate a relevant situation

library(dplyr)
library(ggplot2)
set.seed(123)

T = 100 # no of periods
N = 20 # no of subjects
#simulate time trend for data
ar1 <-c(0.2, 0.5)
ma1 <-c(0.3, 0.5)
ar <- as.numeric(arima.sim(n=T, list(ar = ar1 , ma = ma1)))  + 0.6 * (1:T)
#simulate other parameters
group <- rep(c("control","treat"),each = N*T/2)
D <- rep(c(0,1),each = N*T/2)
exp <-  rep(c(rep(0, T/2), rep(1, T/2)), length.out = N*T)
t<- rep(c(1:T),times = N)
mu <- rep(ar[1:T],times =  N) 
eps <- rnorm(T)
sigma = 2; tau = 22; gamma = 9; beta = 2
eps = rnorm(N*T)
Y = mu + sigma * D  + gamma * D *exp + eps

dat1 <- data.frame( t, group , D, exp, mu, eps, Y)
head(dat1, 5)
##   t   group D exp       mu        eps        Y
## 1 1 control 0   0 2.543891 -0.5743887 1.969502
## 2 2 control 0   0 3.455179 -1.3170161 2.138163
## 3 3 control 0   0 4.394281 -0.1829254 4.211356
## 4 4 control 0   0 4.495029  0.4189824 4.914012
## 5 5 control 0   0 4.668569  0.3243043 4.992874

2. Plot the data to show what pattern you are trying to estimate

gdat = dat1 %>%
  group_by(group,t,exp,D) %>%
  summarize(Y = mean(Y))

ggplot(gdat) +
  geom_line(aes(x = t, y = Y, color = group))+
  geom_vline(xintercept=T/2) +
  labs(x = "Period", y = "Outcome", color = "Treatment") 

3. Use R to run the relevant regression

#regression
library(stargazer)
g <- lm(Y ~ D *exp, data=dat1  )
stargazer(g, title="regression results", type = "text")
## 
## regression results
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  Y             
## -----------------------------------------------
## D                            1.965***          
##                               (0.510)          
##                                                
## exp                          29.637***         
##                               (0.510)          
##                                                
## D:exp                        9.075***          
##                               (0.721)          
##                                                
## Constant                     15.657***         
##                               (0.361)          
##                                                
## -----------------------------------------------
## Observations                   2,000           
## R2                             0.826           
## Adjusted R2                    0.826           
## Residual Std. Error      8.062 (df = 1996)     
## F Statistic         3,155.705*** (df = 3; 1996)
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

4. Present results and discuss

From the plot, we can see that post-trends for treated is higher than control; and it show parallel trends assumption during pre-trends period.

The regression result show that the casual effect of treatment is 9.075, which is very similar to what we simulate, 9. The reason behind it could be that I add time trend(AR(1)) and disturbance term eps.

5. calculate ATT by hand

FTE = dat1 %>%
  group_by(exp,D) %>%
  summarize(Y = mean(Y))
FTE
## # A tibble: 4 × 3
## # Groups:   exp [2]
##     exp     D     Y
##   <dbl> <dbl> <dbl>
## 1     0     0  15.7
## 2     0     1  17.6
## 3     1     0  45.3
## 4     1     1  56.3
ATT <- as.numeric ((FTE[4,3] - FTE[3,3]) -(FTE[2,3] - FTE[1,3]))
ATT
## [1] 9.075009

Similarly as in hw5, I calculate the ATT by hand and it has the same results as I did in regression.

RDD method

1. Create fake data to simulate a relevant situation

# Simulate data for RD design
n <- 1000
cutoff <- 50
sd <- 10
x <- rnorm(n, mean = 50, sd = sd)
t <- as.numeric(x >= cutoff)
eps <- rnorm(n)
y <- 10 + 10*t + 0.5*x + eps
# Create dataframe
data <- data.frame(x = x, y = y, t = t)

2. Plot the data to show what pattern you are trying to estimate

# Plot data
ggplot()+
  geom_point(aes(x = x, y = y),data = data)+
  geom_smooth(aes(x=x,y=y),subset(data,x <cutoff)) +
  geom_smooth(aes(x=x,y=y),subset(data,x >=cutoff))

3. Use R to run the relevant regression

# Fit RD model
g1 <- lm(y ~ x + t, data = data)

g2 <- lm(y ~ x + t + I(x*t), data = data)
stargazer(g1,g2, title=  "RD regression results", type = "text")
## 
## RD regression results
## ===========================================================================
##                                       Dependent variable:                  
##                     -------------------------------------------------------
##                                                y                           
##                                 (1)                         (2)            
## ---------------------------------------------------------------------------
## x                            0.508***                    0.512***          
##                               (0.005)                     (0.007)          
##                                                                            
## t                            9.919***                    10.324***         
##                               (0.103)                     (0.526)          
##                                                                            
## I(x * t)                                                  -0.008           
##                                                           (0.010)          
##                                                                            
## Constant                     9.634***                    9.463***          
##                               (0.221)                     (0.310)          
##                                                                            
## ---------------------------------------------------------------------------
## Observations                   1,000                       1,000           
## R2                             0.989                       0.989           
## Adjusted R2                    0.989                       0.989           
## Residual Std. Error      0.983 (df = 997)            0.984 (df = 996)      
## F Statistic         46,624.340*** (df = 2; 997) 31,071.210*** (df = 3; 996)
## ===========================================================================
## Note:                                           *p<0.1; **p<0.05; ***p<0.01

4. Present results and discuss

The plots show that the data have discontinuity point at x=50. I run two regression with or without interactive term(t*x), they are all very close to my settings and the one with interactive term may be more accurate.

5. Use prepackaged linear RDD

library(rdd)
g3 <- RDestimate(y ~ x, data=data, cutpoint=50)
summary(g3)
## 
## Call:
## RDestimate(formula = y ~ x, data = data, cutpoint = 50)
## 
## Type:
## sharp 
## 
## Estimates:
##            Bandwidth  Observations  Estimate  Std. Error  z value  Pr(>|z|)  
## LATE       4.209      321           9.769     0.2512      38.89     0.000e+00
## Half-BW    2.104      168           9.927     0.3629      27.36    9.044e-165
## Double-BW  8.418      617           9.743     0.1717      56.73     0.000e+00
##               
## LATE       ***
## Half-BW    ***
## Double-BW  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## F-statistics:
##            F     Num. DoF  Denom. DoF  p
## LATE       3499  3         317         0
## Half-BW    1525  3         164         0
## Double-BW  8468  3         613         0
plot(g3)