Data-Driven Heterogeneity
- RATE
- RATE output

Setup

Package loading

options(warn=-1)
options(scipen=999)
suppressMessages(library(estimatr))
suppressMessages(library(stats))
suppressMessages(library(tidyverse))
suppressMessages(library(rmarkdown))
suppressMessages(library(tictoc))
suppressMessages(library(stargazer))
suppressMessages(library(ggpattern))
suppressMessages(library(doParallel))
suppressMessages(library(mltools))
suppressMessages(library(data.table))
suppressMessages(library(RItools))
suppressMessages(library(stringr))
suppressMessages(library(grf))
suppressMessages(library(multcomp))
suppressMessages(library(msm))
suppressMessages(library(lmtest))
suppressMessages(library(fixest))
suppressMessages(library(sandwich))
suppressMessages(library(kableExtra))
nrCores = detectCores()
cl = makeCluster(nrCores)
registerDoParallel(cl)

Bring necessary packages to clusters for parallel computing

## [[1]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[6]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[7]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[8]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[9]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[10]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[11]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"

Data loading and cleaning

data = read_rds("../Cleaned_data/intermediate_data_wide.rds")
wide <- data

# set up treatment variables for each course
wide$treatmentEmotions <- ifelse(wide$treatment == "Emotions", 1, 0)
wide$treatmentReasoning <- ifelse(wide$treatment == "Reasoning", 1, 0)
wide$treatmentCombo <- ifelse(wide$treatment == "Combo", 1, 0)
wide$treatmentFactsBaseline <- ifelse(wide$treatment == "Facts Baseline", 1, 0)

# Create vectors with names and display names of covariates by type to use later #
covariates_continuous = c("age","social_media_hours","base_rate_pre","base_total_acc_score_pre",
                          "misinfo_pre","misinfo_total_acc_score_pre")
covariates_categorical = c("gender_Man","education_High_school_or_less","marital_Married_or_in_a_domestic_partnership", "employment_Employed", "location_Mostly_urban","religion_Christian","religiosity_Attends","social_media_share_above_40","att_check_pre")

covariates <- c("age","gender_Man", "education_High_school_or_less",
                "education_Some_college","education_Bachelor_degree","education_Graduate_degree",
                "marital_Married_or_in_a_domestic_partnership",
                "employment_Unemployed","employment_Employed","employment_Student",
                "location_Mostly_urban","location_Suburban","location_Mostly_rural",
                "religion_Christian","religiosity_Attends",
                "social_media_bin_Yes","social_media_hours",
                "social_media_share_0_20",
                "social_media_share_20_40",
                "social_media_share_40_60",
                "social_media_share_60_80",
                "social_media_share_80_100")

covariates_sharing <- c("misinfo_pre","base_rate_pre")
covariates_accuracy <- c("misinfo_total_acc_score_pre","base_total_acc_score_pre")

covariates_all <- c(covariates,covariates_sharing,covariates_accuracy)

covariates_continuous_fancy = c("Age","Hrs/day on social media",
                                "All posts (Sharing)",
                                "Pre-Survey Misinformation posts (Sharing)",
                                "Pre-Survey Non-misinformation posts (Sharing)",
                                "All posts (Accuracy Score)",
                                "Pre-Survey Misinformation posts (Accuracy Score)",
                                "Pre-Survey Non-misinformation posts (Accuracy Score)")

covariates_binary_fancy = c("Man",
                            "High school or less",
                            "Some college",
                            "Bachelor\'s degree",
                            "Graduate degree",
                            "Married",
                            "Unemployed",
                            "Employed",
                            "Student",
                            "Mostly urban","Suburban","Mostly rural",
                            "Christian",
                            "Attends religious services",
                            "Uses social media",
                            "Social media sharing behavior 0-20%",
                            "Social media sharing behavior 20-40%",
                            "Social media sharing behavior 40-60%",
                            "Social media sharing behavior 60-80%",
                            "Social media sharing behavior 80-100%")
covariates_all_fancy = c("Age",covariates_binary_fancy[!endsWith(covariates_binary_fancy,"%")],"Hrs/day on social media",
                         covariates_binary_fancy[endsWith(covariates_binary_fancy,"%")],
                         covariates_continuous_fancy[c(4, 5, 7, 8)])

# generate mapping between covariates_all and covariates_all_fancy
variable_mapping <- covariates_all_fancy
names(variable_mapping) <- covariates_all

Data-Driven Heterogeneity

This script primarily focuses on: Rank-Weighted Average Treatment Effect

We provide the following comparison.

Reasoning vs Facts Baseline
Emotions vs Facts Baseline
Combo vs Facts Baseline
Reasoning vs Emotions
Reasoning vs Combo
Emotions vs Combo

Given the priority is replicability, we explicitly use the default parameters in all causal forests specification.

sample.fraction = 0.5
min.node.size = 5
honesty.fraction = 0.5
mtry = 26
honesty.prune.leaves = 1
alpha = 0.05
imbalance.penalty = 0
num.tree = 2000
tune.num.trees = 200

RATE

run_RATE <- function(outcome, treatment, wide_treatment){
# Set tuning parameters
# num.trees <- 3000
# mtry <- 50
# honesty.prune.leaves <- FALSE
# tune.parameters <- c("sample.fraction", "min.node.size", "honesty.fraction")
# tune.num.trees <- 300

set.seed(777)
RATEs <- list()
Parameters <- list()
Pvals <- list()
# Loop over all outcomes Y
for (outcome in outcomes) {
  # Print Outcome and Covariates at the top of each loop
  print(paste("Outcome:", outcome))
  
  
  # Define Y to be the outcome
  Y <- wide_treatment[,outcome]
  
  # Define X to be the matrix 
  fmla <- formula(paste0("~ 0 + ", paste0(covariates_all, collapse="+")))
  X <- model.matrix(fmla, wide_treatment)
  
  # Define W to be the treatment
  W <- wide_treatment[,treatment]
    
  # Train a causal forest to estimate a CATE based priority ranking
  train <- sample(1:nrow(X), nrow(X) / 2)
  cf.priority <- causal_forest(X[train,], Y[train], W[train], seed = 777, num.trees = 2000,
                               sample.fraction = 0.5,
                               min.node.size = 5,
                               honesty.fraction = 0.5,
                               mtry = 26,
                               honesty.prune.leaves = 1,
                               alpha = 0.05,
                               imbalance.penalty = 0,
                               tune.num.trees = 200)

  tuned.parameters <- cf.priority$tunable.params
  Parameters[[outcome]] <- tuned.parameters
  
  # Compute a prioritization based on estimated treatment effects.
  priority.cate <- predict(cf.priority, X[-train,])$predictions
  
  # Estimate AUTOC on held out data using tuned parameters from the priority model
  cf.eval <- causal_forest(X[-train, ], Y[-train], W[-train], seed = 777, num.trees = 2000,
                               sample.fraction = 0.5,
                               min.node.size = 5,
                               honesty.fraction = 0.5,
                               mtry = 26,
                               honesty.prune.leaves = 1,
                               alpha = 0.05,
                               imbalance.penalty = 0,
                               tune.num.trees = 200)

  # Estimate AUTOC
  rate <- rank_average_treatment_effect(cf.eval, priority.cate)
  RATEs[[outcome]] <- rate
  
  t.stat = rate$estimate / rate$std.err
  # Compute a two-sided p-value Pr(>|t|)
  p.val = 2 * pnorm(-abs(t.stat))
  Pvals[[outcome]] <- p.val

  # Plot the Targeting Operator Characteristic curve
  rate_plot <- plot(rate, las=1)
  print(rate_plot)

  # Add title to plot "TOC Curve for outcome"
  title(paste("TOC Curve for", outcome), line = -2, cex.main = 0.8)
}
  return(list(RATEs = RATEs, Parameters = Parameters, Pvals = Pvals))
}

outcomes <- c("misinfo_post", "base_rate_post", "new_disc_post")

Reasoning vs Facts Baseline

rate_RvF <- run_RATE(outcomes, "treatmentReasoning", wide %>% filter(treatment %in% c("Reasoning", "Facts Baseline")))

## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Emotions vs Facts Baseline

rate_EvF <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Emotions", "Facts Baseline")))

## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Combo vs Facts Baseline

rate_CvF <- run_RATE(outcomes, "treatmentCombo", wide %>% filter(treatment %in% c("Combo", "Facts Baseline")))

## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Emotions vs Reasoning

rate_EvR <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Reasoning", "Emotions")))

## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Emotions vs Combo

rate_EvC <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Emotions", "Combo")))

## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

RATE output

RATE is done by sample splitting, i.e. using half of the datasets to fit a causal forest and the other half as evaluation dataset to generate RATE.

The RATE outputs the estimates, standard errors, and p-values for the RATEs for each outcome and comparison combination.

Note that numbers within () are standard errors, and numbers within [] are two sided p-values.

# organizing table
RATE_output <- matrix(NA, nrow = 15, ncol = 4)

RATE_output[1, ] <- c("Reasoning vs Facts Baseline", rate_RvF$RATEs$misinfo_post$estimate, rate_RvF$RATEs$base_rate_post$estimate, rate_RvF$RATEs$new_disc_post$estimate)
RATE_output[2, ]  <- c("", rate_RvF$RATEs$misinfo_post$std.err, rate_RvF$RATEs$base_rate_post$std.err, rate_RvF$RATEs$new_disc_post$std.err)
RATE_output[3, ] <- c("", rate_RvF$Pvals$misinfo_post, rate_RvF$Pvals$base_rate_post, rate_RvF$Pvals$new_disc_post)
RATE_output[4, ] <- c("Emotions vs Facts Baseline", rate_EvF$RATEs$misinfo_post$estimate, rate_EvF$RATEs$base_rate_post$estimate, rate_EvF$RATEs$new_disc_post$estimate)
RATE_output[5, ]  <- c(" ", rate_EvF$RATEs$misinfo_post$std.err, rate_EvF$RATEs$base_rate_post$std.err, rate_EvF$RATEs$new_disc_post$std.err)
RATE_output[6, ] <- c(" ", rate_EvF$Pvals$misinfo_post, rate_EvF$Pvals$base_rate_post, rate_EvF$Pvals$new_disc_post)
RATE_output[7, ] <- c("Combo vs Facts Baseline", rate_CvF$RATEs$misinfo_post$estimate, rate_CvF$RATEs$base_rate_post$estimate, rate_CvF$RATEs$new_disc_post$estimate)
RATE_output[8, ]  <- c("  ", rate_CvF$RATEs$misinfo_post$std.err, rate_CvF$RATEs$base_rate_post$std.err, rate_CvF$RATEs$new_disc_post$std.err)
RATE_output[9, ] <- c(" ", rate_CvF$Pvals$misinfo_post, rate_CvF$Pvals$base_rate_post, rate_CvF$Pvals$new_disc_post)
RATE_output[10, ] <- c("Emotions vs Reasoning", rate_EvR$RATEs$misinfo_post$estimate, rate_EvR$RATEs$base_rate_post$estimate, rate_EvR$RATEs$new_disc_post$estimate)
RATE_output[11, ]  <- c("   ", rate_EvR$RATEs$misinfo_post$std.err, rate_EvR$RATEs$base_rate_post$std.err, rate_EvR$RATEs$new_disc_post$std.err)
RATE_output[12, ] <- c("   ", rate_EvR$Pvals$misinfo_post, rate_EvR$Pvals$base_rate_post, rate_EvR$Pvals$new_disc_post)
RATE_output[13, ] <- c("Emotions vs Combo", rate_EvC$RATEs$misinfo_post$estimate, rate_EvC$RATEs$base_rate_post$estimate, rate_EvC$RATEs$new_disc_post$estimate)
RATE_output[14, ]  <- c("     ", rate_EvC$RATEs$misinfo_post$std.err, rate_EvC$RATEs$base_rate_post$std.err, rate_EvC$RATEs$new_disc_post$std.err)
RATE_output[15, ] <- c("     ", rate_EvC$Pvals$misinfo_post, rate_EvC$Pvals$base_rate_post, rate_EvC$Pvals$new_disc_post)

# round to four digits for column 2 to 4 
RATE_output[, 2:4] <- round(as.numeric(RATE_output[, 2:4]), 4)
colnames(RATE_output) <- c("Comparison", "Misinformation Sharing", "Non-misinformation Sharing", "Sharing Discernment")

# round to 4 digits row 1, 4, 7, 10, 13
RATE_output[(1:5) * 3 - 2, 2:4] <- sapply(RATE_output[(1:5) * 3 - 2, 2:4], function(x) round(as.numeric(x), 4))
# for row 2, 5, 8, 11, 14, add () to the values
RATE_output[(1:5) * 3 - 1, 2:4] <- sapply(RATE_output[(1:5) * 3 - 1, 2:4], function(x) paste0("(", round(as.numeric(x), 4), ")"))
# for row 3, 6, 9, 12, 15, add [] to the values
RATE_output[(1:5) * 3, 2:4] <- sapply(RATE_output[(1:5) * 3, 2:4], function(x) paste0("[", round(as.numeric(x), 4), "]"))
# saveRDS(RATE_output, "../Cleaned_data/7_RATE_output.RDS")

RATE_output <- as.data.frame(RATE_output)
kable(RATE_output, format = "html", digits = 4) %>% 
  column_spec(1, bold = TRUE) %>%
  column_spec(2:4, background = "#f7f7f7") %>%
  kable_styling(position = "left", latex_options = c("striped", "hold_position"))

Comparison	Misinformation Sharing	Non-misinformation Sharing	Sharing Discernment
Reasoning vs Facts Baseline	0.0478	0.0199	0.0193
	(0.0121)	(0.0153)	(0.0133)
	[0.0001]	[0.1931]	[0.1473]
Emotions vs Facts Baseline	0.06	0.0279	0.0137
	(0.0133)	(0.013)	(0.014)
	[0]	[0.032]	[0.3298]
Combo vs Facts Baseline	0.0264	0.0193	0.0059
	(0.0129)	(0.0147)	(0.0162)
	[0.0403]	[0.1896]	[0.7175]
Emotions vs Reasoning	0.0238	0.0116	-0.01
	(0.0131)	(0.016)	(0.0148)
	[0.0687]	[0.4713]	[0.4995]
Emotions vs Combo	-0.0101	0.0063	0.0116
	(0.0132)	(0.0148)	(0.0144)
	[0.4442]	[0.6702]	[0.4202]

Subgroup Analysis