Simulation in ‘Control Variables in Interactive Models’

In this script, I illustrate the use of controls in interacted models using simulated data. This code reproduces the simulation in DMSSW (2023) in R. The original simulations were created in Stata and code can be found here.

General housekeeping items

Let’s begin by opening libraries and clearing the environment:

library(tidyverse)
library(broom)

rm(list=ls())

Set your working directory:

setwd('C:/YOURWD')

Generate ‘simulated’ data for correlation and regression

Let’s simulate the data from DMSSW. We will be using ‘randomness’ in this simulation so let’s first set a seed so that our procedure is reproducible:

set.seed(42)

Next, let’s recreate the DGP from steps 1-5 for 10,000 observations bootstrapped 1,000 iterations.

for(sim in 1:1000) {

  #I.a: the data generating process (DGP)
  reg_data <- tibble(obs = 1:10000, #set the dataset size (Step 1)
                     ue = rnorm(n = 10000, mean = 0, sd = 2), #generate an unexpected earnings (ue) variable as a random draw from a normal distribution with a mean of 0 and a standard deviation of 2 (Step 2)
                     size = rnorm(n = 10000, mean = 8, sd = 2), #generate a company size (size) variable as a random draw from a normal distribution with a mean of 8 and a standard deviation of 2 (Step 3)
                     ue_size = ue * size, #create an interaction term between ue and size 
                     dm_size = size - mean(size), #create a demeaned size variable 
                     dm_ue_size = ue * dm_size, #create an interaction term between ue and demeaned size 
                     car = (10 * ue) + (0 * size) + (10 * ue_size) + rnorm(n = 10000, mean =  0, sd = 100), #generate cumulative abnormal returns (Step 4)
                     wsj = as.numeric(size >= median(size)), #generate a Wall Street Journal coverage variable (wsj) which equals 1 for firms above the median for size and 0 otherwise (Step 5)
                     ue_wsj = ue * wsj) #create and interaction term between ue and wsj
  
  #I.b: analysis    
  reg_1 <- lm(car ~ ue, data = reg_data) #simple ERC model
  reg_2 <- lm(car ~ ue + size, data = reg_data) #adding size
  reg_3 <- lm(car ~ ue + size + ue_size, data = reg_data) #adding size interacted with ue
  reg_4 <- lm(car ~ ue + dm_size + dm_ue_size, data = reg_data) #respecification adding demeaned size interaction with ue
  reg_5 <- lm(car ~ ue + wsj + ue_wsj, data = reg_data) #simple wsj ERC model
  reg_6 <- lm(car ~ ue + wsj + ue_wsj + dm_size, data = reg_data) #adding size
  reg_7 <- lm(car ~ ue + wsj + ue_wsj + dm_size + dm_ue_size, data = reg_data) #adding size interacted with ue
                   
  reg_coef <- bind_rows(tibble(tidy(reg_1), reg = '1', sim = sim),
                        tibble(tidy(reg_2), reg = '2', sim = sim),
                        tibble(tidy(reg_3), reg = '3', sim = sim), 
                        tibble(tidy(reg_4), reg = '4', sim = sim),
                        tibble(tidy(reg_5), reg = '5', sim = sim),
                        tibble(tidy(reg_6), reg = '6', sim = sim),
                        tibble(tidy(reg_7), reg = '7', sim = sim))
  
  reg_stats <- bind_rows(tibble(glance(reg_1), reg = '1', sim = sim),
                         tibble(glance(reg_2), reg = '2', sim = sim),
                         tibble(glance(reg_3), reg = '3', sim = sim), 
                         tibble(glance(reg_4), reg = '4', sim = sim),
                         tibble(glance(reg_5), reg = '5', sim = sim),
                         tibble(glance(reg_6), reg = '6', sim = sim),
                         tibble(glance(reg_7), reg = '7', sim = sim))
  
  if(sim == 1) {
    simulation_coef <- reg_coef 
    simulation_stats <- reg_stats
  } else {
    simulation_coef <- bind_rows(simulation_coef, reg_coef)
    simulation_stats <- bind_rows(simulation_stats, reg_stats)
  }
  
  if(sim %in% seq(0,1000,100)) { 
  print(paste0("Simulation ", sim, " completed at ", Sys.time()))
  }
  
  rm(list=ls(pattern="reg_"))
}
[1] "Simulation 100 completed at 2023-10-18 22:34:13.206939"
[1] "Simulation 200 completed at 2023-10-18 22:34:19.950253"
[1] "Simulation 300 completed at 2023-10-18 22:34:26.620525"
[1] "Simulation 400 completed at 2023-10-18 22:34:33.213765"
[1] "Simulation 500 completed at 2023-10-18 22:34:40.056807"
[1] "Simulation 600 completed at 2023-10-18 22:34:46.798578"
[1] "Simulation 700 completed at 2023-10-18 22:34:53.814478"
[1] "Simulation 800 completed at 2023-10-18 22:35:00.574731"
[1] "Simulation 900 completed at 2023-10-18 22:35:07.466513"
[1] "Simulation 1000 completed at 2023-10-18 22:35:14.304771"

The loop above produces two datasets. The first, simulation_coef contains coefficient estimates (and related statistics) for each regression. The second, simulation_stats, contains regression statistics for each regression (7 regressions and 1,000 simulations = 7,000 observations.). We need to collapse and summarize these datasets to produce the summary tables in DMSSW Tables 1 and 2.

stats <- simulation_stats %>% 
  group_by(reg) %>% 
  summarize(value = mean(adj.r.squared)) %>%
  mutate(term = "adj_r2")

coefs <- simulation_coef %>% 
  group_by(reg, term) %>% 
  summarize(coef = mean(estimate),
            t_stat = mean(statistic)) %>%
  pivot_longer(-c(reg, term), values_to = "value", names_to = "stat") 

output <- coefs %>%
  bind_rows(stats) %>% 
  mutate(term = factor(term, levels = c("ue", "size", "ue_size", "dm_size", "dm_ue_size", "wsj", "ue_wsj", "(Intercept)", "adj_r2"), ordered = TRUE)) %>% 
  pivot_wider(values_from = value, names_from = reg, names_prefix = "reg_") %>% 
  arrange(term)

Store the output in a csv file in working directory.

write_csv(output, "DMSSW2023.csv") #note will save to local working directory
knitr::kable(output, format = "html") 
term stat reg_1 reg_2 reg_3 reg_4 reg_5 reg_6 reg_7
ue coef 90.0048975 90.0046173 9.9572078 89.9950492 74.0206883 74.0223435 89.9663657
ue t_stat 167.1078108 167.1014929 4.8282004 179.9493141 101.7391039 101.7375214 92.8571215
size coef NA -0.0152889 -0.0185417 NA NA NA NA
size t_stat NA -0.0279795 -0.0370293 NA NA NA NA
ue_size coef NA NA 10.0036385 NA NA NA NA
ue_size t_stat NA NA 40.0125474 NA NA NA NA
dm_size coef NA NA NA -0.0185417 NA -0.0078409 -0.0134705
dm_size t_stat NA NA NA -0.0370293 NA -0.0088896 -0.0162587
dm_ue_size coef NA NA NA 10.0036385 NA NA 9.9916684
dm_ue_size t_stat NA NA NA 40.0125474 NA NA 24.0836662
wsj coef NA NA NA NA -0.0584856 -0.0328896 -0.0252062
wsj t_stat NA NA NA NA -0.0283121 -0.0098089 -0.0076185
ue_wsj coef NA NA NA NA 31.9598196 31.9570372 0.0586305
ue_wsj t_stat NA NA NA NA 31.0639645 31.0599732 0.0348556
(Intercept) coef -0.0145277 0.1081932 0.1339072 -0.0146057 0.0166371 0.0038436 -0.0018672
(Intercept) t_stat -0.0133963 0.0238887 0.0323807 -0.0145605 0.0114868 0.0021859 -0.0009175
adj_r2 NA 0.7362854 0.7362950 0.7726912 0.7726912 0.7594975 0.7595025 0.7726923