Setup

Package loading

options(warn=-1)
options(scipen=999)
suppressMessages(library(estimatr))
suppressMessages(library(stats))
suppressMessages(library(tidyverse))
suppressMessages(library(rmarkdown))
suppressMessages(library(tictoc))
suppressMessages(library(stargazer))
suppressMessages(library(ggpattern))
suppressMessages(library(doParallel))
suppressMessages(library(mltools))
suppressMessages(library(data.table))
suppressMessages(library(RItools))
suppressMessages(library(stringr))
suppressMessages(library(grf))
suppressMessages(library(multcomp))
suppressMessages(library(msm))
suppressMessages(library(lmtest))
suppressMessages(library(fixest))
suppressMessages(library(sandwich))
suppressMessages(library(kableExtra))
nrCores = detectCores()
cl = makeCluster(nrCores)
registerDoParallel(cl)

Bring necessary packages to clusters for parallel computing

## [[1]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[6]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[7]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[8]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[9]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[10]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[11]]
##  [1] "sandwich"  "fixest"    "lmtest"    "zoo"       "msm"       "multcomp" 
##  [7] "TH.data"   "MASS"      "survival"  "mvtnorm"   "rmarkdown" "lubridate"
## [13] "forcats"   "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"    
## [19] "tibble"    "ggplot2"   "tidyverse" "estimatr"  "stats"     "graphics" 
## [25] "grDevices" "utils"     "datasets"  "methods"   "base"

Data loading and cleaning

data = read_rds("../Cleaned_data/intermediate_data_wide.rds")
wide <- data

# set up treatment variables for each course
wide$treatmentEmotions <- ifelse(wide$treatment == "Emotions", 1, 0)
wide$treatmentReasoning <- ifelse(wide$treatment == "Reasoning", 1, 0)
wide$treatmentCombo <- ifelse(wide$treatment == "Combo", 1, 0)
wide$treatmentFactsBaseline <- ifelse(wide$treatment == "Facts Baseline", 1, 0)
# Create vectors with names and display names of covariates by type to use later #
covariates_continuous = c("age","social_media_hours","base_rate_pre","base_total_acc_score_pre",
                          "misinfo_pre","misinfo_total_acc_score_pre")
covariates_categorical = c("gender_Man","education_High_school_or_less","marital_Married_or_in_a_domestic_partnership", "employment_Employed", "location_Mostly_urban","religion_Christian","religiosity_Attends","social_media_share_above_40","att_check_pre")

covariates <- c("age","gender_Man", "education_High_school_or_less",
                "education_Some_college","education_Bachelor_degree","education_Graduate_degree",
                "marital_Married_or_in_a_domestic_partnership",
                "employment_Unemployed","employment_Employed","employment_Student",
                "location_Mostly_urban","location_Suburban","location_Mostly_rural",
                "religion_Christian","religiosity_Attends",
                "social_media_bin_Yes","social_media_hours",
                "social_media_share_0_20",
                "social_media_share_20_40",
                "social_media_share_40_60",
                "social_media_share_60_80",
                "social_media_share_80_100")

covariates_sharing <- c("misinfo_pre","base_rate_pre")
covariates_accuracy <- c("misinfo_total_acc_score_pre","base_total_acc_score_pre")

covariates_all <- c(covariates,covariates_sharing,covariates_accuracy)

covariates_continuous_fancy = c("Age","Hrs/day on social media",
                                "All posts (Sharing)",
                                "Pre-Survey Misinformation posts (Sharing)",
                                "Pre-Survey Non-misinformation posts (Sharing)",
                                "All posts (Accuracy Score)",
                                "Pre-Survey Misinformation posts (Accuracy Score)",
                                "Pre-Survey Non-misinformation posts (Accuracy Score)")

covariates_binary_fancy = c("Man",
                            "High school or less",
                            "Some college",
                            "Bachelor\'s degree",
                            "Graduate degree",
                            "Married",
                            "Unemployed",
                            "Employed",
                            "Student",
                            "Mostly urban","Suburban","Mostly rural",
                            "Christian",
                            "Attends religious services",
                            "Uses social media",
                            "Social media sharing behavior 0-20%",
                            "Social media sharing behavior 20-40%",
                            "Social media sharing behavior 40-60%",
                            "Social media sharing behavior 60-80%",
                            "Social media sharing behavior 80-100%")
covariates_all_fancy = c("Age",covariates_binary_fancy[!endsWith(covariates_binary_fancy,"%")],"Hrs/day on social media",
                         covariates_binary_fancy[endsWith(covariates_binary_fancy,"%")],
                         covariates_continuous_fancy[c(4, 5, 7, 8)])

# generate mapping between covariates_all and covariates_all_fancy
variable_mapping <- covariates_all_fancy
names(variable_mapping) <- covariates_all

Data-Driven Heterogeneity

This script primarily focuses on: Rank-Weighted Average Treatment Effect

We provide the following comparison.

Given the priority is replicability, we explicitly use the default parameters in all causal forests specification.

RATE

run_RATE <- function(outcome, treatment, wide_treatment){
# Set tuning parameters
# num.trees <- 3000
# mtry <- 50
# honesty.prune.leaves <- FALSE
# tune.parameters <- c("sample.fraction", "min.node.size", "honesty.fraction")
# tune.num.trees <- 300

set.seed(777)
RATEs <- list()
Parameters <- list()
Pvals <- list()
# Loop over all outcomes Y
for (outcome in outcomes) {
  # Print Outcome and Covariates at the top of each loop
  print(paste("Outcome:", outcome))
  
  
  # Define Y to be the outcome
  Y <- wide_treatment[,outcome]
  
  # Define X to be the matrix 
  fmla <- formula(paste0("~ 0 + ", paste0(covariates_all, collapse="+")))
  X <- model.matrix(fmla, wide_treatment)
  
  # Define W to be the treatment
  W <- wide_treatment[,treatment]
    
  # Train a causal forest to estimate a CATE based priority ranking
  train <- sample(1:nrow(X), nrow(X) / 2)
  cf.priority <- causal_forest(X[train,], Y[train], W[train], seed = 777, num.trees = 2000,
                               sample.fraction = 0.5,
                               min.node.size = 5,
                               honesty.fraction = 0.5,
                               mtry = 26,
                               honesty.prune.leaves = 1,
                               alpha = 0.05,
                               imbalance.penalty = 0,
                               tune.num.trees = 200)

  tuned.parameters <- cf.priority$tunable.params
  Parameters[[outcome]] <- tuned.parameters
  
  # Compute a prioritization based on estimated treatment effects.
  priority.cate <- predict(cf.priority, X[-train,])$predictions
  
  # Estimate AUTOC on held out data using tuned parameters from the priority model
  cf.eval <- causal_forest(X[-train, ], Y[-train], W[-train], seed = 777, num.trees = 2000,
                               sample.fraction = 0.5,
                               min.node.size = 5,
                               honesty.fraction = 0.5,
                               mtry = 26,
                               honesty.prune.leaves = 1,
                               alpha = 0.05,
                               imbalance.penalty = 0,
                               tune.num.trees = 200)

  # Estimate AUTOC
  rate <- rank_average_treatment_effect(cf.eval, priority.cate)
  RATEs[[outcome]] <- rate
  
  t.stat = rate$estimate / rate$std.err
  # Compute a two-sided p-value Pr(>|t|)
  p.val = 2 * pnorm(-abs(t.stat))
  Pvals[[outcome]] <- p.val

  # Plot the Targeting Operator Characteristic curve
  rate_plot <- plot(rate, las=1)
  print(rate_plot)

  # Add title to plot "TOC Curve for outcome"
  title(paste("TOC Curve for", outcome), line = -2, cex.main = 0.8)
}
  return(list(RATEs = RATEs, Parameters = Parameters, Pvals = Pvals))
}
outcomes <- c("misinfo_post", "base_rate_post", "new_disc_post")

Reasoning vs Facts Baseline

rate_RvF <- run_RATE(outcomes, "treatmentReasoning", wide %>% filter(treatment %in% c("Reasoning", "Facts Baseline")))
## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Emotions vs Facts Baseline

rate_EvF <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Emotions", "Facts Baseline")))
## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Combo vs Facts Baseline

rate_CvF <- run_RATE(outcomes, "treatmentCombo", wide %>% filter(treatment %in% c("Combo", "Facts Baseline")))
## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Emotions vs Reasoning

rate_EvR <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Reasoning", "Emotions")))
## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

Emotions vs Combo

rate_EvC <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Emotions", "Combo")))
## [1] "Outcome: misinfo_post"

## NULL
## [1] "Outcome: base_rate_post"

## NULL
## [1] "Outcome: new_disc_post"

## NULL

RATE output

RATE is done by sample splitting, i.e. using half of the datasets to fit a causal forest and the other half as evaluation dataset to generate RATE.

The RATE outputs the estimates, standard errors, and p-values for the RATEs for each outcome and comparison combination.

Note that numbers within () are standard errors, and numbers within [] are two sided p-values.

# organizing table
RATE_output <- matrix(NA, nrow = 15, ncol = 4)

RATE_output[1, ] <- c("Reasoning vs Facts Baseline", rate_RvF$RATEs$misinfo_post$estimate, rate_RvF$RATEs$base_rate_post$estimate, rate_RvF$RATEs$new_disc_post$estimate)
RATE_output[2, ]  <- c("", rate_RvF$RATEs$misinfo_post$std.err, rate_RvF$RATEs$base_rate_post$std.err, rate_RvF$RATEs$new_disc_post$std.err)
RATE_output[3, ] <- c("", rate_RvF$Pvals$misinfo_post, rate_RvF$Pvals$base_rate_post, rate_RvF$Pvals$new_disc_post)
RATE_output[4, ] <- c("Emotions vs Facts Baseline", rate_EvF$RATEs$misinfo_post$estimate, rate_EvF$RATEs$base_rate_post$estimate, rate_EvF$RATEs$new_disc_post$estimate)
RATE_output[5, ]  <- c(" ", rate_EvF$RATEs$misinfo_post$std.err, rate_EvF$RATEs$base_rate_post$std.err, rate_EvF$RATEs$new_disc_post$std.err)
RATE_output[6, ] <- c(" ", rate_EvF$Pvals$misinfo_post, rate_EvF$Pvals$base_rate_post, rate_EvF$Pvals$new_disc_post)
RATE_output[7, ] <- c("Combo vs Facts Baseline", rate_CvF$RATEs$misinfo_post$estimate, rate_CvF$RATEs$base_rate_post$estimate, rate_CvF$RATEs$new_disc_post$estimate)
RATE_output[8, ]  <- c("  ", rate_CvF$RATEs$misinfo_post$std.err, rate_CvF$RATEs$base_rate_post$std.err, rate_CvF$RATEs$new_disc_post$std.err)
RATE_output[9, ] <- c(" ", rate_CvF$Pvals$misinfo_post, rate_CvF$Pvals$base_rate_post, rate_CvF$Pvals$new_disc_post)
RATE_output[10, ] <- c("Emotions vs Reasoning", rate_EvR$RATEs$misinfo_post$estimate, rate_EvR$RATEs$base_rate_post$estimate, rate_EvR$RATEs$new_disc_post$estimate)
RATE_output[11, ]  <- c("   ", rate_EvR$RATEs$misinfo_post$std.err, rate_EvR$RATEs$base_rate_post$std.err, rate_EvR$RATEs$new_disc_post$std.err)
RATE_output[12, ] <- c("   ", rate_EvR$Pvals$misinfo_post, rate_EvR$Pvals$base_rate_post, rate_EvR$Pvals$new_disc_post)
RATE_output[13, ] <- c("Emotions vs Combo", rate_EvC$RATEs$misinfo_post$estimate, rate_EvC$RATEs$base_rate_post$estimate, rate_EvC$RATEs$new_disc_post$estimate)
RATE_output[14, ]  <- c("     ", rate_EvC$RATEs$misinfo_post$std.err, rate_EvC$RATEs$base_rate_post$std.err, rate_EvC$RATEs$new_disc_post$std.err)
RATE_output[15, ] <- c("     ", rate_EvC$Pvals$misinfo_post, rate_EvC$Pvals$base_rate_post, rate_EvC$Pvals$new_disc_post)

# round to four digits for column 2 to 4 
RATE_output[, 2:4] <- round(as.numeric(RATE_output[, 2:4]), 4)
colnames(RATE_output) <- c("Comparison", "Misinformation Sharing", "Non-misinformation Sharing", "Sharing Discernment")

# round to 4 digits row 1, 4, 7, 10, 13
RATE_output[(1:5) * 3 - 2, 2:4] <- sapply(RATE_output[(1:5) * 3 - 2, 2:4], function(x) round(as.numeric(x), 4))
# for row 2, 5, 8, 11, 14, add () to the values
RATE_output[(1:5) * 3 - 1, 2:4] <- sapply(RATE_output[(1:5) * 3 - 1, 2:4], function(x) paste0("(", round(as.numeric(x), 4), ")"))
# for row 3, 6, 9, 12, 15, add [] to the values
RATE_output[(1:5) * 3, 2:4] <- sapply(RATE_output[(1:5) * 3, 2:4], function(x) paste0("[", round(as.numeric(x), 4), "]"))
# saveRDS(RATE_output, "../Cleaned_data/7_RATE_output.RDS")

RATE_output <- as.data.frame(RATE_output)
kable(RATE_output, format = "html", digits = 4) %>% 
  column_spec(1, bold = TRUE) %>%
  column_spec(2:4, background = "#f7f7f7") %>%
  kable_styling(position = "left", latex_options = c("striped", "hold_position"))
Comparison Misinformation Sharing Non-misinformation Sharing Sharing Discernment
Reasoning vs Facts Baseline 0.0478 0.0199 0.0193
(0.0121) (0.0153) (0.0133)
[0.0001] [0.1931] [0.1473]
Emotions vs Facts Baseline 0.06 0.0279 0.0137
(0.0133) (0.013) (0.014)
[0] [0.032] [0.3298]
Combo vs Facts Baseline 0.0264 0.0193 0.0059
(0.0129) (0.0147) (0.0162)
[0.0403] [0.1896] [0.7175]
Emotions vs Reasoning 0.0238 0.0116 -0.01
(0.0131) (0.016) (0.0148)
[0.0687] [0.4713] [0.4995]
Emotions vs Combo -0.0101 0.0063 0.0116
(0.0132) (0.0148) (0.0144)
[0.4442] [0.6702] [0.4202]