knitr::opts_chunk$set(echo = TRUE)
library(AER)
## Warning: package 'AER' was built under R version 4.3.3
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(ggplot2)
library(ggthemes)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Create the above data set 1000 times for visualization

sampleSize <- 1000
myList = list()
set.seed(12345)
for (i in 1L:1000L) {
  omVar <- rnorm(sampleSize)
  z1 <- rnorm(sampleSize)
  z2 <- rnorm(sampleSize)
  x2 <- rnorm(sampleSize)
  x1 <- z1 + z2 + omVar
  y <- 2 + 3*x1 + 1*x2 + 3*omVar + 10*rnorm(sampleSize)

  myList[[i]] <- data.frame(y = y,
                            x1 = x1,
                            x2 = x2,
                            z1 = z1,
                            z2 = z2,
                            sample = i)
  rm(omVar, z1, z2, x1, x2, y)
}

myData <- data.table::rbindlist(myList)

Graphs

OLS

m1 = data.frame(Intercept = NA,
                x1 = NA,
                x2 = NA)
for (i in 1L:1000L) {
  m1 <- rbind(m1, lm(y ~ x1 + x2, subset(myData,sample == i))$coefficients)
}
m1 = m1[-1,]

ZKHL

m2 <- data.frame(Intercept = NA,
                 x1_1 = NA,
                 x2 = NA)
for (i in 1L:1000L) {
  myData$x1_1[myData$sample == i] <- lm(x1~z1 + x2, subset(myData,sample == i))$residuals
  m2 <- rbind(m2, lm(y ~ x1_1 + x2, subset(myData,sample == i))$coefficients)
}
m2 <- m2[-1,]

2SLS

m3 <- data.frame(Intercept = NA,
                 x1_2 = NA,
                 x2 = NA)
for (i in 1L:1000L) {
  myData$x1_2[myData$sample == i] <- lm(x1~z1 + x2, subset(myData,sample == i))$fitted.values
  m3 <- rbind(m3, lm(y ~ x1_2 + x2, subset(myData,sample == i))$coefficients)
}
m3 <- m3[-1,]

Three plots together for comparison

m1$Model <- 'OLS'
m2$Model <- 'ZKHL'
m3$Model <- '2SLS'
mMeans <- rbind(m1, setNames(m2, names(m1)), setNames(m3,names(m1))) %>%
  mutate(Model2 = factor(Model, levels = c('OLS','ZKHL','2SLS'))) %>%
  group_by(Model2) %>%
  summarise(x1 = mean(x1))

rbind(m1,setNames(m2, names(m1)),setNames(m3,names(m1))) %>%
  mutate(Model2 = factor(Model, levels = c('OLS','ZKHL','2SLS'))) %>%
  ggplot(aes(x1)) +
  geom_histogram(bins = 40) +
  geom_vline(aes(xintercept = x1, color = Model2), data = mMeans) +
  theme_fivethirtyeight() +
  facet_wrap(~Model2, nrow = 3) +
  theme(legend.position = 'none')

Reporting in the paper

summary(lm(y ~ x1 + x2, subset(myData,sample == 1)))
## 
## Call:
## lm(formula = y ~ x1 + x2, data = subset(myData, sample == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.116  -7.103   0.418   6.977  34.033 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0868     0.3351   6.227 6.98e-10 ***
## x1            3.7352     0.1945  19.202  < 2e-16 ***
## x2            1.2846     0.3521   3.649 0.000277 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.6 on 997 degrees of freedom
## Multiple R-squared:  0.275,  Adjusted R-squared:  0.2735 
## F-statistic: 189.1 on 2 and 997 DF,  p-value: < 2.2e-16

Model with ZKHL method

myData$x1_1[myData$sample == 1] <- lm(x1~z1 + x2, subset(myData,sample == 1))$residuals
summary(lm(y ~ x1_1 + x2, subset(myData,sample == 1)))
## 
## Call:
## lm(formula = y ~ x1_1 + x2, data = subset(myData, sample == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.372  -7.525   0.097   7.036  37.858 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0538     0.3438   5.974 3.22e-09 ***
## x1_1          4.3416     0.2505  17.335  < 2e-16 ***
## x2            1.0807     0.3611   2.993  0.00283 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.87 on 997 degrees of freedom
## Multiple R-squared:  0.2369, Adjusted R-squared:  0.2353 
## F-statistic: 154.7 on 2 and 997 DF,  p-value: < 2.2e-16

Model with correct method

myData$x1_2[myData$sample == 1] <- lm(x1~z1 + x2, subset(myData,sample == 1))$fitted.values
summary(lm(y ~ x1_2 + x2, subset(myData,sample == 1)))
## 
## Call:
## lm(formula = y ~ x1_2 + x2, data = subset(myData, sample == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.083  -8.018  -0.482   7.393  37.011 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0774     0.3821   5.437 6.83e-08 ***
## x1_2          2.6805     0.3671   7.302 5.81e-13 ***
## x2            1.2270     0.4018   3.054  0.00232 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.08 on 997 degrees of freedom
## Multiple R-squared:  0.05727,    Adjusted R-squared:  0.05538 
## F-statistic: 30.28 on 2 and 997 DF,  p-value: 1.707e-13

2SLS using ivreg function

ivM1 <- ivreg(y ~ x1 + x2|x2 + z1, data = subset(myData,sample == 1))
summary(ivM1, diagnostics = TRUE)
## 
## Call:
## ivreg(formula = y ~ x1 + x2 | x2 + z1, data = subset(myData, 
##     sample == 1))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -39.9030  -7.1996   0.3698   6.9697  33.9642 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0774     0.3400   6.110 1.43e-09 ***
## x1            2.6805     0.3267   8.206 7.02e-16 ***
## x2            1.2270     0.3575   3.432 0.000624 ***
## 
## Diagnostic tests:
##                  df1 df2 statistic p-value    
## Weak instruments   1 997    573.24 < 2e-16 ***
## Wu-Hausman         1 996     17.18 3.7e-05 ***
## Sargan             0  NA        NA      NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.75 on 997 degrees of freedom
## Multiple R-Squared: 0.2536,  Adjusted R-squared: 0.2521 
## Wald test: 38.25 on 2 and 997 DF,  p-value: < 2.2e-16

What if we have more than one instrument for x1?

ivM2 <- ivreg(y ~ x1 + x2|x2 + z1 + z2, data = subset(myData,sample == 1))
summary(ivM2, diagnostics = TRUE)
## 
## Call:
## ivreg(formula = y ~ x1 + x2 | x2 + z1 + z2, data = subset(myData, 
##     sample == 1))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -39.9440  -7.1656   0.3717   6.9970  33.9605 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0770     0.3405   6.099 1.52e-09 ***
## x1            2.6254     0.2424  10.830  < 2e-16 ***
## x2            1.2240     0.3579   3.420 0.000651 ***
## 
## Diagnostic tests:
##                  df1 df2 statistic  p-value    
## Weak instruments   2 996   988.150  < 2e-16 ***
## Wu-Hausman         1 996    68.988 3.21e-16 ***
## Sargan             1  NA     0.063    0.802    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.77 on 997 degrees of freedom
## Multiple R-Squared: 0.2513,  Adjusted R-squared: 0.2498 
## Wald test: 63.21 on 2 and 997 DF,  p-value: < 2.2e-16

Control function

summary(lm(y ~ x1 + x1_1 + x2, subset(myData,sample == 1)))
## 
## Call:
## lm(formula = y ~ x1 + x1_1 + x2, data = subset(myData, sample == 
##     1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.552  -7.083   0.448   6.630  33.311 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0774     0.3324   6.249  6.1e-10 ***
## x1            2.6805     0.3194   8.393  < 2e-16 ***
## x1_1          1.6611     0.4008   4.144  3.7e-05 ***
## x2            1.2270     0.3495   3.510 0.000467 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.51 on 996 degrees of freedom
## Multiple R-squared:  0.2873, Adjusted R-squared:  0.2851 
## F-statistic: 133.8 on 3 and 996 DF,  p-value: < 2.2e-16