Homework 6

Duflo’s Plumber Economist

2023-02-07

#Loading required packages
library(ggplot2)
library(stargazer)
library(ivreg)
library(MASS)
library(dplyr)

IV Simulations

1. Create fake data to simulate a relevant situation

set.seed(786)
n <- 1500
rho <- matrix(c(1, 0.7, 0.7, 1), 2, 2, byrow = TRUE)
errors <- mvrnorm(n, c(0,0), rho)
e <- errors[,1] 
v <- errors[,2]
Z <- runif(n, min = 0, max = 1)
X <- 1 + 0.5*Z + v
Y <- -10 + 1.5*X + e

df <- data.frame(Y,X,Z)

2. Plot the data to show what pattern you are trying to estimate

#Plotting First Stage (Correlation b/w Z and X)
plot(Z, X)

# Fitting a Line
abline(lm(X ~ Z))

3. Use R to run the relevant regression

ols <- lm(Y~X,df)

iv <- ivreg(Y~X|Z)

d. Present results and discuss

stargazer(ols,iv, title = "IV Regression", type = "text",
          dep.var.caption = "Dependent variable: Y",
          dep.var.labels.include = FALSE,
          out = "regression_results.html")
## 
## IV Regression
## =========================================================================
##                                           Dependent variable: Y          
##                                 -----------------------------------------
##                                             OLS              instrumental
##                                                                variable  
##                                             (1)                  (2)     
## -------------------------------------------------------------------------
## X                                         2.179***             1.653***  
##                                           (0.018)              (0.151)   
##                                                                          
## Constant                                 -10.823***           -10.170*** 
##                                           (0.028)              (0.189)   
##                                                                          
## -------------------------------------------------------------------------
## Observations                               1,500                1,500    
## R2                                         0.910                0.857    
## Adjusted R2                                0.910                0.857    
## Residual Std. Error (df = 1498)            0.695                0.877    
## F Statistic                     15,222.660*** (df = 1; 1498)             
## =========================================================================
## Note:                                         *p<0.1; **p<0.05; ***p<0.01

The table above shows the results for both OLS and IV regressions. One can see that OLS gives biased estimate of coefficient of X (i.e., 2.179 while I set the TRUE coeff as 1.5). Once I do IV regression, I recover close to TRUE estimate (i.e., 1.65).

DID Simulations

1. Create fake data to simulate a relevant situation

#sample size
n <- 1000
#treatment effect
beta <- 10
#treatment dummy
treat <- c(rep(0, n/2), rep(1, n/2))
#simulating outcome variable
y_pre <- 10 + rnorm(n)
y_pre[1:n/2] <- y_pre[1:n/2] - 1
y_post <- 10 + 2.5 + beta*treat + rnorm(n)
y_post[1:n/2] <- y_post[1:n/2] - 1

pre <- rep(0, length(y_pre[treat==0]))
post <- rep(1, length(y_pre[treat==0]))

df <- data.frame("Y" = c(y_pre,y_post),
                "Treatment" = treat, 
                "Period" = c(rep(1, n), rep(2, n)))

2. Plot the data to show what pattern you are trying to estimate

did_lines <- df %>%
  dplyr::group_by(Period, Treatment) %>%
  dplyr::mutate(group_mean = mean(Y)) %>%
  ggplot(aes(x = Period, y = group_mean, color = factor(Treatment))) +
  geom_point() +
  geom_line(aes(x = Period, y = group_mean)) +
  scale_x_continuous(breaks = c(0,1)) +
  scale_color_manual(name = " ",
                     values = c("blue", "green"),
                     labels = c("Control Group", "Treatment Group"))
did_lines

3. Use R to run the relevant regression

TreatmentxPeriod=df$Treatment*df$Period

OLS <- lm(Y~Treatment, df)

DID <- lm(Y~Treatment+Period+TreatmentxPeriod, df)

d. Present results and discuss

stargazer(OLS, DID, title = "DID Simulated Data", type = "text",
          dep.var.caption = "Dependent variable: Y",
          dep.var.labels.include = FALSE,
          out = "did_regression_results.html")
## 
## DID Simulated Data
## ==========================================================================
##                                     Dependent variable: Y                 
##                     ------------------------------------------------------
##                                (1)                        (2)             
## --------------------------------------------------------------------------
## Treatment                   6.032***                   -9.022***          
##                              (0.208)                    (0.145)           
##                                                                           
## Period                                                  2.556***          
##                                                         (0.065)           
##                                                                           
## TreatmentxPeriod                                       10.036***          
##                                                         (0.092)           
##                                                                           
## Constant                    10.234***                   6.401***          
##                              (0.147)                    (0.103)           
##                                                                           
## --------------------------------------------------------------------------
## Observations                  2,000                      2,000            
## R2                            0.295                      0.966            
## Adjusted R2                   0.295                      0.966            
## Residual Std. Error     4.660 (df = 1998)          1.028 (df = 1996)      
## F Statistic         837.817*** (df = 1; 1998) 18,770.500*** (df = 3; 1996)
## ==========================================================================
## Note:                                          *p<0.1; **p<0.05; ***p<0.01

In the table above, column 1 provides the results for OLS regression when I simply use “Treatment” as the right-hand side variable. The results show that I recover under-estimated treatment effect. However, once I include “Treatment” and “Period” in addition to the interaction of both, I recover the originally simulated treatment effect denoted by coefficient on “TreatmentxPeriod”.