knitr::opts_chunk$set(echo = TRUE)
library(AER)
## Warning: package 'AER' was built under R version 4.3.3
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(ggplot2)
library(ggthemes)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Create the above data set 1000 times for visualization
sampleSize <- 1000
myList = list()
set.seed(12345)
for (i in 1L:1000L) {
omVar <- rnorm(sampleSize)
z1 <- rnorm(sampleSize)
z2 <- rnorm(sampleSize)
x2 <- rnorm(sampleSize)
x1 <- z1 + z2 + omVar
y <- 2 + 3*x1 + 1*x2 + 3*omVar + 10*rnorm(sampleSize)
myList[[i]] <- data.frame(y = y,
x1 = x1,
x2 = x2,
z1 = z1,
z2 = z2,
sample = i)
rm(omVar, z1, z2, x1, x2, y)
}
myData <- data.table::rbindlist(myList)
Graphs
OLS
m1 = data.frame(Intercept = NA,
x1 = NA,
x2 = NA)
for (i in 1L:1000L) {
m1 <- rbind(m1, lm(y ~ x1 + x2, subset(myData,sample == i))$coefficients)
}
m1 = m1[-1,]
ZKHL
m2 <- data.frame(Intercept = NA,
x1_1 = NA,
x2 = NA)
for (i in 1L:1000L) {
myData$x1_1[myData$sample == i] <- lm(x1~z1 + x2, subset(myData,sample == i))$residuals
m2 <- rbind(m2, lm(y ~ x1_1 + x2, subset(myData,sample == i))$coefficients)
}
m2 <- m2[-1,]
2SLS
m3 <- data.frame(Intercept = NA,
x1_2 = NA,
x2 = NA)
for (i in 1L:1000L) {
myData$x1_2[myData$sample == i] <- lm(x1~z1 + x2, subset(myData,sample == i))$fitted.values
m3 <- rbind(m3, lm(y ~ x1_2 + x2, subset(myData,sample == i))$coefficients)
}
m3 <- m3[-1,]
Three plots together for comparison
m1$Model <- 'OLS'
m2$Model <- 'ZKHL'
m3$Model <- '2SLS'
mMeans <- rbind(m1, setNames(m2, names(m1)), setNames(m3,names(m1))) %>%
mutate(Model2 = factor(Model, levels = c('OLS','ZKHL','2SLS'))) %>%
group_by(Model2) %>%
summarise(x1 = mean(x1))
rbind(m1,setNames(m2, names(m1)),setNames(m3,names(m1))) %>%
mutate(Model2 = factor(Model, levels = c('OLS','ZKHL','2SLS'))) %>%
ggplot(aes(x1)) +
geom_histogram(bins = 40) +
geom_vline(aes(xintercept = x1, color = Model2), data = mMeans) +
theme_fivethirtyeight() +
facet_wrap(~Model2, nrow = 3) +
theme(legend.position = 'none')

Reporting in the paper
summary(lm(y ~ x1 + x2, subset(myData,sample == 1)))
##
## Call:
## lm(formula = y ~ x1 + x2, data = subset(myData, sample == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.116 -7.103 0.418 6.977 34.033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0868 0.3351 6.227 6.98e-10 ***
## x1 3.7352 0.1945 19.202 < 2e-16 ***
## x2 1.2846 0.3521 3.649 0.000277 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.6 on 997 degrees of freedom
## Multiple R-squared: 0.275, Adjusted R-squared: 0.2735
## F-statistic: 189.1 on 2 and 997 DF, p-value: < 2.2e-16
Model with ZKHL method
myData$x1_1[myData$sample == 1] <- lm(x1~z1 + x2, subset(myData,sample == 1))$residuals
summary(lm(y ~ x1_1 + x2, subset(myData,sample == 1)))
##
## Call:
## lm(formula = y ~ x1_1 + x2, data = subset(myData, sample == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.372 -7.525 0.097 7.036 37.858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0538 0.3438 5.974 3.22e-09 ***
## x1_1 4.3416 0.2505 17.335 < 2e-16 ***
## x2 1.0807 0.3611 2.993 0.00283 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.87 on 997 degrees of freedom
## Multiple R-squared: 0.2369, Adjusted R-squared: 0.2353
## F-statistic: 154.7 on 2 and 997 DF, p-value: < 2.2e-16
Model with correct method
myData$x1_2[myData$sample == 1] <- lm(x1~z1 + x2, subset(myData,sample == 1))$fitted.values
summary(lm(y ~ x1_2 + x2, subset(myData,sample == 1)))
##
## Call:
## lm(formula = y ~ x1_2 + x2, data = subset(myData, sample == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.083 -8.018 -0.482 7.393 37.011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0774 0.3821 5.437 6.83e-08 ***
## x1_2 2.6805 0.3671 7.302 5.81e-13 ***
## x2 1.2270 0.4018 3.054 0.00232 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.08 on 997 degrees of freedom
## Multiple R-squared: 0.05727, Adjusted R-squared: 0.05538
## F-statistic: 30.28 on 2 and 997 DF, p-value: 1.707e-13
2SLS using ivreg function
ivM1 <- ivreg(y ~ x1 + x2|x2 + z1, data = subset(myData,sample == 1))
summary(ivM1, diagnostics = TRUE)
##
## Call:
## ivreg(formula = y ~ x1 + x2 | x2 + z1, data = subset(myData,
## sample == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.9030 -7.1996 0.3698 6.9697 33.9642
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0774 0.3400 6.110 1.43e-09 ***
## x1 2.6805 0.3267 8.206 7.02e-16 ***
## x2 1.2270 0.3575 3.432 0.000624 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 1 997 573.24 < 2e-16 ***
## Wu-Hausman 1 996 17.18 3.7e-05 ***
## Sargan 0 NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.75 on 997 degrees of freedom
## Multiple R-Squared: 0.2536, Adjusted R-squared: 0.2521
## Wald test: 38.25 on 2 and 997 DF, p-value: < 2.2e-16
What if we have more than one instrument for x1?
ivM2 <- ivreg(y ~ x1 + x2|x2 + z1 + z2, data = subset(myData,sample == 1))
summary(ivM2, diagnostics = TRUE)
##
## Call:
## ivreg(formula = y ~ x1 + x2 | x2 + z1 + z2, data = subset(myData,
## sample == 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.9440 -7.1656 0.3717 6.9970 33.9605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0770 0.3405 6.099 1.52e-09 ***
## x1 2.6254 0.2424 10.830 < 2e-16 ***
## x2 1.2240 0.3579 3.420 0.000651 ***
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments 2 996 988.150 < 2e-16 ***
## Wu-Hausman 1 996 68.988 3.21e-16 ***
## Sargan 1 NA 0.063 0.802
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.77 on 997 degrees of freedom
## Multiple R-Squared: 0.2513, Adjusted R-squared: 0.2498
## Wald test: 63.21 on 2 and 997 DF, p-value: < 2.2e-16
Control function
summary(lm(y ~ x1 + x1_1 + x2, subset(myData,sample == 1)))
##
## Call:
## lm(formula = y ~ x1 + x1_1 + x2, data = subset(myData, sample ==
## 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.552 -7.083 0.448 6.630 33.311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0774 0.3324 6.249 6.1e-10 ***
## x1 2.6805 0.3194 8.393 < 2e-16 ***
## x1_1 1.6611 0.4008 4.144 3.7e-05 ***
## x2 1.2270 0.3495 3.510 0.000467 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.51 on 996 degrees of freedom
## Multiple R-squared: 0.2873, Adjusted R-squared: 0.2851
## F-statistic: 133.8 on 3 and 996 DF, p-value: < 2.2e-16