Simulation in ‘Control Variables in Interactive Models’

In this script, I illustrate the use of controls in interacted models using simulated data. This code reproduces the simulation in DMSSW (2023) in R. The original simulations were created in Stata and code can be found here.

General housekeeping items

Let’s begin by opening libraries and clearing the environment:

library(tidyverse)
library(broom)

rm(list=ls())

Set your working directory:

setwd('C:/YOURWD')

Generate ‘simulated’ data for correlation and regression

Let’s simulate the data from DMSSW. We will be using ‘randomness’ in this simulation so let’s first set a seed so that our procedure is reproducible:

set.seed(42)

Next, let’s recreate the DGP from steps 1-5 for 10,000 observations bootstrapped 1,000 iterations.

for(sim in 1:1000) {

  #I.a: the data generating process (DGP)
  reg_data <- tibble(obs = 1:10000, #set the dataset size (Step 1)
                     ue = rnorm(n = 10000, mean = 0, sd = 2), #generate an unexpected earnings (ue) variable as a random draw from a normal distribution with a mean of 0 and a standard deviation of 2 (Step 2)
                     size = rnorm(n = 10000, mean = 8, sd = 2), #generate a company size (size) variable as a random draw from a normal distribution with a mean of 8 and a standard deviation of 2 (Step 3)
                     ue_size = ue * size, #create an interaction term between ue and size 
                     dm_size = size - mean(size), #create a demeaned size variable 
                     dm_ue_size = ue * dm_size, #create an interaction term between ue and demeaned size 
                     car = (10 * ue) + (0 * size) + (10 * ue_size) + rnorm(n = 10000, mean =  0, sd = 100), #generate cumulative abnormal returns (Step 4)
                     wsj = as.numeric(size >= median(size)), #generate a Wall Street Journal coverage variable (wsj) which equals 1 for firms above the median for size and 0 otherwise (Step 5)
                     ue_wsj = ue * wsj) #create and interaction term between ue and wsj
  
  #I.b: analysis    
  reg_1 <- lm(car ~ ue, data = reg_data) #simple ERC model
  reg_2 <- lm(car ~ ue + size, data = reg_data) #adding size
  reg_3 <- lm(car ~ ue + size + ue_size, data = reg_data) #adding size interacted with ue
  reg_4 <- lm(car ~ ue + dm_size + dm_ue_size, data = reg_data) #respecification adding demeaned size interaction with ue
  reg_5 <- lm(car ~ ue + wsj + ue_wsj, data = reg_data) #simple wsj ERC model
  reg_6 <- lm(car ~ ue + wsj + ue_wsj + dm_size, data = reg_data) #adding size
  reg_7 <- lm(car ~ ue + wsj + ue_wsj + dm_size + dm_ue_size, data = reg_data) #adding size interacted with ue
                   
  reg_coef <- bind_rows(tibble(tidy(reg_1), reg = '1', sim = sim),
                        tibble(tidy(reg_2), reg = '2', sim = sim),
                        tibble(tidy(reg_3), reg = '3', sim = sim), 
                        tibble(tidy(reg_4), reg = '4', sim = sim),
                        tibble(tidy(reg_5), reg = '5', sim = sim),
                        tibble(tidy(reg_6), reg = '6', sim = sim),
                        tibble(tidy(reg_7), reg = '7', sim = sim))
  
  reg_stats <- bind_rows(tibble(glance(reg_1), reg = '1', sim = sim),
                         tibble(glance(reg_2), reg = '2', sim = sim),
                         tibble(glance(reg_3), reg = '3', sim = sim), 
                         tibble(glance(reg_4), reg = '4', sim = sim),
                         tibble(glance(reg_5), reg = '5', sim = sim),
                         tibble(glance(reg_6), reg = '6', sim = sim),
                         tibble(glance(reg_7), reg = '7', sim = sim))
  
  if(sim == 1) {
    simulation_coef <- reg_coef 
    simulation_stats <- reg_stats
  } else {
    simulation_coef <- bind_rows(simulation_coef, reg_coef)
    simulation_stats <- bind_rows(simulation_stats, reg_stats)
  }
  
  if(sim %in% seq(0,1000,100)) { 
  print(paste0("Simulation ", sim, " completed at ", Sys.time()))
  }
  
  rm(list=ls(pattern="reg_"))
}

[1] "Simulation 100 completed at 2023-10-18 22:34:13.206939"
[1] "Simulation 200 completed at 2023-10-18 22:34:19.950253"
[1] "Simulation 300 completed at 2023-10-18 22:34:26.620525"
[1] "Simulation 400 completed at 2023-10-18 22:34:33.213765"
[1] "Simulation 500 completed at 2023-10-18 22:34:40.056807"
[1] "Simulation 600 completed at 2023-10-18 22:34:46.798578"
[1] "Simulation 700 completed at 2023-10-18 22:34:53.814478"
[1] "Simulation 800 completed at 2023-10-18 22:35:00.574731"
[1] "Simulation 900 completed at 2023-10-18 22:35:07.466513"
[1] "Simulation 1000 completed at 2023-10-18 22:35:14.304771"

The loop above produces two datasets. The first, simulation_coef contains coefficient estimates (and related statistics) for each regression. The second, simulation_stats, contains regression statistics for each regression (7 regressions and 1,000 simulations = 7,000 observations.). We need to collapse and summarize these datasets to produce the summary tables in DMSSW Tables 1 and 2.

stats <- simulation_stats %>% 
  group_by(reg) %>% 
  summarize(value = mean(adj.r.squared)) %>%
  mutate(term = "adj_r2")

coefs <- simulation_coef %>% 
  group_by(reg, term) %>% 
  summarize(coef = mean(estimate),
            t_stat = mean(statistic)) %>%
  pivot_longer(-c(reg, term), values_to = "value", names_to = "stat") 

output <- coefs %>%
  bind_rows(stats) %>% 
  mutate(term = factor(term, levels = c("ue", "size", "ue_size", "dm_size", "dm_ue_size", "wsj", "ue_wsj", "(Intercept)", "adj_r2"), ordered = TRUE)) %>% 
  pivot_wider(values_from = value, names_from = reg, names_prefix = "reg_") %>% 
  arrange(term)

Store the output in a csv file in working directory.

write_csv(output, "DMSSW2023.csv") #note will save to local working directory
knitr::kable(output, format = "html")

term	stat	reg_1	reg_2	reg_3	reg_4	reg_5	reg_6	reg_7
ue	coef	90.0048975	90.0046173	9.9572078	89.9950492	74.0206883	74.0223435	89.9663657
ue	t_stat	167.1078108	167.1014929	4.8282004	179.9493141	101.7391039	101.7375214	92.8571215
size	coef	NA	-0.0152889	-0.0185417	NA	NA	NA	NA
size	t_stat	NA	-0.0279795	-0.0370293	NA	NA	NA	NA
ue_size	coef	NA	NA	10.0036385	NA	NA	NA	NA
ue_size	t_stat	NA	NA	40.0125474	NA	NA	NA	NA
dm_size	coef	NA	NA	NA	-0.0185417	NA	-0.0078409	-0.0134705
dm_size	t_stat	NA	NA	NA	-0.0370293	NA	-0.0088896	-0.0162587
dm_ue_size	coef	NA	NA	NA	10.0036385	NA	NA	9.9916684
dm_ue_size	t_stat	NA	NA	NA	40.0125474	NA	NA	24.0836662
wsj	coef	NA	NA	NA	NA	-0.0584856	-0.0328896	-0.0252062
wsj	t_stat	NA	NA	NA	NA	-0.0283121	-0.0098089	-0.0076185
ue_wsj	coef	NA	NA	NA	NA	31.9598196	31.9570372	0.0586305
ue_wsj	t_stat	NA	NA	NA	NA	31.0639645	31.0599732	0.0348556
(Intercept)	coef	-0.0145277	0.1081932	0.1339072	-0.0146057	0.0166371	0.0038436	-0.0018672
(Intercept)	t_stat	-0.0133963	0.0238887	0.0323807	-0.0145605	0.0114868	0.0021859	-0.0009175
adj_r2	NA	0.7362854	0.7362950	0.7726912	0.7726912	0.7594975	0.7595025	0.7726923