#Loading required packages
library(ggplot2)
library(stargazer)
library(ivreg)
library(MASS)
library(dplyr)IV Simulations
1. Create fake data to simulate a relevant situation
set.seed(786)
n <- 1500
rho <- matrix(c(1, 0.7, 0.7, 1), 2, 2, byrow = TRUE)
errors <- mvrnorm(n, c(0,0), rho)
e <- errors[,1]
v <- errors[,2]
Z <- runif(n, min = 0, max = 1)
X <- 1 + 0.5*Z + v
Y <- -10 + 1.5*X + e
df <- data.frame(Y,X,Z)2. Plot the data to show what pattern you are trying to estimate
#Plotting First Stage (Correlation b/w Z and X)
plot(Z, X)
# Fitting a Line
abline(lm(X ~ Z))3. Use R to run the relevant regression
ols <- lm(Y~X,df)
iv <- ivreg(Y~X|Z)d. Present results and discuss
stargazer(ols,iv, title = "IV Regression", type = "text",
dep.var.caption = "Dependent variable: Y",
dep.var.labels.include = FALSE,
out = "regression_results.html")##
## IV Regression
## =========================================================================
## Dependent variable: Y
## -----------------------------------------
## OLS instrumental
## variable
## (1) (2)
## -------------------------------------------------------------------------
## X 2.179*** 1.653***
## (0.018) (0.151)
##
## Constant -10.823*** -10.170***
## (0.028) (0.189)
##
## -------------------------------------------------------------------------
## Observations 1,500 1,500
## R2 0.910 0.857
## Adjusted R2 0.910 0.857
## Residual Std. Error (df = 1498) 0.695 0.877
## F Statistic 15,222.660*** (df = 1; 1498)
## =========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The table above shows the results for both OLS and IV regressions. One can see that OLS gives biased estimate of coefficient of X (i.e., 2.179 while I set the TRUE coeff as 1.5). Once I do IV regression, I recover close to TRUE estimate (i.e., 1.65).
DID Simulations
1. Create fake data to simulate a relevant situation
#sample size
n <- 1000
#treatment effect
beta <- 10
#treatment dummy
treat <- c(rep(0, n/2), rep(1, n/2))
#simulating outcome variable
y_pre <- 10 + rnorm(n)
y_pre[1:n/2] <- y_pre[1:n/2] - 1
y_post <- 10 + 2.5 + beta*treat + rnorm(n)
y_post[1:n/2] <- y_post[1:n/2] - 1
pre <- rep(0, length(y_pre[treat==0]))
post <- rep(1, length(y_pre[treat==0]))
df <- data.frame("Y" = c(y_pre,y_post),
"Treatment" = treat,
"Period" = c(rep(1, n), rep(2, n)))2. Plot the data to show what pattern you are trying to estimate
did_lines <- df %>%
dplyr::group_by(Period, Treatment) %>%
dplyr::mutate(group_mean = mean(Y)) %>%
ggplot(aes(x = Period, y = group_mean, color = factor(Treatment))) +
geom_point() +
geom_line(aes(x = Period, y = group_mean)) +
scale_x_continuous(breaks = c(0,1)) +
scale_color_manual(name = " ",
values = c("blue", "green"),
labels = c("Control Group", "Treatment Group"))
did_lines3. Use R to run the relevant regression
TreatmentxPeriod=df$Treatment*df$Period
OLS <- lm(Y~Treatment, df)
DID <- lm(Y~Treatment+Period+TreatmentxPeriod, df)d. Present results and discuss
stargazer(OLS, DID, title = "DID Simulated Data", type = "text",
dep.var.caption = "Dependent variable: Y",
dep.var.labels.include = FALSE,
out = "did_regression_results.html")##
## DID Simulated Data
## ==========================================================================
## Dependent variable: Y
## ------------------------------------------------------
## (1) (2)
## --------------------------------------------------------------------------
## Treatment 6.032*** -9.022***
## (0.208) (0.145)
##
## Period 2.556***
## (0.065)
##
## TreatmentxPeriod 10.036***
## (0.092)
##
## Constant 10.234*** 6.401***
## (0.147) (0.103)
##
## --------------------------------------------------------------------------
## Observations 2,000 2,000
## R2 0.295 0.966
## Adjusted R2 0.295 0.966
## Residual Std. Error 4.660 (df = 1998) 1.028 (df = 1996)
## F Statistic 837.817*** (df = 1; 1998) 18,770.500*** (df = 3; 1996)
## ==========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
In the table above, column 1 provides the results for OLS regression when I simply use “Treatment” as the right-hand side variable. The results show that I recover under-estimated treatment effect. However, once I include “Treatment” and “Period” in addition to the interaction of both, I recover the originally simulated treatment effect denoted by coefficient on “TreatmentxPeriod”.