library(tidyverse)
library(broom)
rm(list=ls())Simulation in ‘Control Variables in Interactive Models’
In this script, I illustrate the use of controls in interacted models using simulated data. This code reproduces the simulation in DMSSW (2023) in R. The original simulations were created in Stata and code can be found here.
General housekeeping items
Let’s begin by opening libraries and clearing the environment:
Set your working directory:
setwd('C:/YOURWD')Generate ‘simulated’ data for correlation and regression
Let’s simulate the data from DMSSW. We will be using ‘randomness’ in this simulation so let’s first set a seed so that our procedure is reproducible:
set.seed(42)Next, let’s recreate the DGP from steps 1-5 for 10,000 observations bootstrapped 1,000 iterations.
for(sim in 1:1000) {
#I.a: the data generating process (DGP)
reg_data <- tibble(obs = 1:10000, #set the dataset size (Step 1)
ue = rnorm(n = 10000, mean = 0, sd = 2), #generate an unexpected earnings (ue) variable as a random draw from a normal distribution with a mean of 0 and a standard deviation of 2 (Step 2)
size = rnorm(n = 10000, mean = 8, sd = 2), #generate a company size (size) variable as a random draw from a normal distribution with a mean of 8 and a standard deviation of 2 (Step 3)
ue_size = ue * size, #create an interaction term between ue and size
dm_size = size - mean(size), #create a demeaned size variable
dm_ue_size = ue * dm_size, #create an interaction term between ue and demeaned size
car = (10 * ue) + (0 * size) + (10 * ue_size) + rnorm(n = 10000, mean = 0, sd = 100), #generate cumulative abnormal returns (Step 4)
wsj = as.numeric(size >= median(size)), #generate a Wall Street Journal coverage variable (wsj) which equals 1 for firms above the median for size and 0 otherwise (Step 5)
ue_wsj = ue * wsj) #create and interaction term between ue and wsj
#I.b: analysis
reg_1 <- lm(car ~ ue, data = reg_data) #simple ERC model
reg_2 <- lm(car ~ ue + size, data = reg_data) #adding size
reg_3 <- lm(car ~ ue + size + ue_size, data = reg_data) #adding size interacted with ue
reg_4 <- lm(car ~ ue + dm_size + dm_ue_size, data = reg_data) #respecification adding demeaned size interaction with ue
reg_5 <- lm(car ~ ue + wsj + ue_wsj, data = reg_data) #simple wsj ERC model
reg_6 <- lm(car ~ ue + wsj + ue_wsj + dm_size, data = reg_data) #adding size
reg_7 <- lm(car ~ ue + wsj + ue_wsj + dm_size + dm_ue_size, data = reg_data) #adding size interacted with ue
reg_coef <- bind_rows(tibble(tidy(reg_1), reg = '1', sim = sim),
tibble(tidy(reg_2), reg = '2', sim = sim),
tibble(tidy(reg_3), reg = '3', sim = sim),
tibble(tidy(reg_4), reg = '4', sim = sim),
tibble(tidy(reg_5), reg = '5', sim = sim),
tibble(tidy(reg_6), reg = '6', sim = sim),
tibble(tidy(reg_7), reg = '7', sim = sim))
reg_stats <- bind_rows(tibble(glance(reg_1), reg = '1', sim = sim),
tibble(glance(reg_2), reg = '2', sim = sim),
tibble(glance(reg_3), reg = '3', sim = sim),
tibble(glance(reg_4), reg = '4', sim = sim),
tibble(glance(reg_5), reg = '5', sim = sim),
tibble(glance(reg_6), reg = '6', sim = sim),
tibble(glance(reg_7), reg = '7', sim = sim))
if(sim == 1) {
simulation_coef <- reg_coef
simulation_stats <- reg_stats
} else {
simulation_coef <- bind_rows(simulation_coef, reg_coef)
simulation_stats <- bind_rows(simulation_stats, reg_stats)
}
if(sim %in% seq(0,1000,100)) {
print(paste0("Simulation ", sim, " completed at ", Sys.time()))
}
rm(list=ls(pattern="reg_"))
}[1] "Simulation 100 completed at 2023-10-18 22:34:13.206939"
[1] "Simulation 200 completed at 2023-10-18 22:34:19.950253"
[1] "Simulation 300 completed at 2023-10-18 22:34:26.620525"
[1] "Simulation 400 completed at 2023-10-18 22:34:33.213765"
[1] "Simulation 500 completed at 2023-10-18 22:34:40.056807"
[1] "Simulation 600 completed at 2023-10-18 22:34:46.798578"
[1] "Simulation 700 completed at 2023-10-18 22:34:53.814478"
[1] "Simulation 800 completed at 2023-10-18 22:35:00.574731"
[1] "Simulation 900 completed at 2023-10-18 22:35:07.466513"
[1] "Simulation 1000 completed at 2023-10-18 22:35:14.304771"
The loop above produces two datasets. The first, simulation_coef contains coefficient estimates (and related statistics) for each regression. The second, simulation_stats, contains regression statistics for each regression (7 regressions and 1,000 simulations = 7,000 observations.). We need to collapse and summarize these datasets to produce the summary tables in DMSSW Tables 1 and 2.
stats <- simulation_stats %>%
group_by(reg) %>%
summarize(value = mean(adj.r.squared)) %>%
mutate(term = "adj_r2")
coefs <- simulation_coef %>%
group_by(reg, term) %>%
summarize(coef = mean(estimate),
t_stat = mean(statistic)) %>%
pivot_longer(-c(reg, term), values_to = "value", names_to = "stat")
output <- coefs %>%
bind_rows(stats) %>%
mutate(term = factor(term, levels = c("ue", "size", "ue_size", "dm_size", "dm_ue_size", "wsj", "ue_wsj", "(Intercept)", "adj_r2"), ordered = TRUE)) %>%
pivot_wider(values_from = value, names_from = reg, names_prefix = "reg_") %>%
arrange(term)Store the output in a csv file in working directory.
write_csv(output, "DMSSW2023.csv") #note will save to local working directory
knitr::kable(output, format = "html") | term | stat | reg_1 | reg_2 | reg_3 | reg_4 | reg_5 | reg_6 | reg_7 |
|---|---|---|---|---|---|---|---|---|
| ue | coef | 90.0048975 | 90.0046173 | 9.9572078 | 89.9950492 | 74.0206883 | 74.0223435 | 89.9663657 |
| ue | t_stat | 167.1078108 | 167.1014929 | 4.8282004 | 179.9493141 | 101.7391039 | 101.7375214 | 92.8571215 |
| size | coef | NA | -0.0152889 | -0.0185417 | NA | NA | NA | NA |
| size | t_stat | NA | -0.0279795 | -0.0370293 | NA | NA | NA | NA |
| ue_size | coef | NA | NA | 10.0036385 | NA | NA | NA | NA |
| ue_size | t_stat | NA | NA | 40.0125474 | NA | NA | NA | NA |
| dm_size | coef | NA | NA | NA | -0.0185417 | NA | -0.0078409 | -0.0134705 |
| dm_size | t_stat | NA | NA | NA | -0.0370293 | NA | -0.0088896 | -0.0162587 |
| dm_ue_size | coef | NA | NA | NA | 10.0036385 | NA | NA | 9.9916684 |
| dm_ue_size | t_stat | NA | NA | NA | 40.0125474 | NA | NA | 24.0836662 |
| wsj | coef | NA | NA | NA | NA | -0.0584856 | -0.0328896 | -0.0252062 |
| wsj | t_stat | NA | NA | NA | NA | -0.0283121 | -0.0098089 | -0.0076185 |
| ue_wsj | coef | NA | NA | NA | NA | 31.9598196 | 31.9570372 | 0.0586305 |
| ue_wsj | t_stat | NA | NA | NA | NA | 31.0639645 | 31.0599732 | 0.0348556 |
| (Intercept) | coef | -0.0145277 | 0.1081932 | 0.1339072 | -0.0146057 | 0.0166371 | 0.0038436 | -0.0018672 |
| (Intercept) | t_stat | -0.0133963 | 0.0238887 | 0.0323807 | -0.0145605 | 0.0114868 | 0.0021859 | -0.0009175 |
| adj_r2 | NA | 0.7362854 | 0.7362950 | 0.7726912 | 0.7726912 | 0.7594975 | 0.7595025 | 0.7726923 |