options(warn=-1)
options(scipen=999)
suppressMessages(library(estimatr))
suppressMessages(library(stats))
suppressMessages(library(tidyverse))
suppressMessages(library(rmarkdown))
suppressMessages(library(tictoc))
suppressMessages(library(stargazer))
suppressMessages(library(ggpattern))
suppressMessages(library(doParallel))
suppressMessages(library(mltools))
suppressMessages(library(data.table))
suppressMessages(library(RItools))
suppressMessages(library(stringr))
suppressMessages(library(grf))
suppressMessages(library(multcomp))
suppressMessages(library(msm))
suppressMessages(library(lmtest))
suppressMessages(library(fixest))
suppressMessages(library(sandwich))
suppressMessages(library(kableExtra))
# Register a parallel backend using all available cores
nrCores = detectCores()
cl = makeCluster(nrCores)
registerDoParallel(cl)
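Once the parallel work is finished (for example, at the end of the script), the backend can be released. A minimal housekeeping sketch, not part of the original pipeline, shown commented out so it does not run here:
# stopCluster(cl)   # shut down the worker processes
# registerDoSEQ()   # revert foreach to sequential evaluation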
data = read_rds("../Cleaned_data/intermediate_data_wide.rds")
wide <- data
# Create a binary indicator for each treatment arm (course)
wide$treatmentEmotions <- ifelse(wide$treatment == "Emotions", 1, 0)
wide$treatmentReasoning <- ifelse(wide$treatment == "Reasoning", 1, 0)
wide$treatmentCombo <- ifelse(wide$treatment == "Combo", 1, 0)
wide$treatmentFactsBaseline <- ifelse(wide$treatment == "Facts Baseline", 1, 0)
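As a quick sanity check (not in the original script), the arm sizes from the raw treatment variable should match the sums of the new indicators:
# Hypothetical check: each indicator should sum to its arm's sample size
table(wide$treatment)
colSums(wide[, c("treatmentEmotions", "treatmentReasoning",
                 "treatmentCombo", "treatmentFactsBaseline")])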
# Create vectors with names and display names of covariates by type to use later #
covariates_continuous = c("age","social_media_hours","base_rate_pre","base_total_acc_score_pre",
"misinfo_pre","misinfo_total_acc_score_pre")
covariates_categorical = c("gender_Man","education_High_school_or_less",
                           "marital_Married_or_in_a_domestic_partnership","employment_Employed",
                           "location_Mostly_urban","religion_Christian","religiosity_Attends",
                           "social_media_share_above_40","att_check_pre")
covariates <- c("age","gender_Man", "education_High_school_or_less",
"education_Some_college","education_Bachelor_degree","education_Graduate_degree",
"marital_Married_or_in_a_domestic_partnership",
"employment_Unemployed","employment_Employed","employment_Student",
"location_Mostly_urban","location_Suburban","location_Mostly_rural",
"religion_Christian","religiosity_Attends",
"social_media_bin_Yes","social_media_hours",
"social_media_share_0_20",
"social_media_share_20_40",
"social_media_share_40_60",
"social_media_share_60_80",
"social_media_share_80_100")
covariates_sharing <- c("misinfo_pre","base_rate_pre")
covariates_accuracy <- c("misinfo_total_acc_score_pre","base_total_acc_score_pre")
covariates_all <- c(covariates,covariates_sharing,covariates_accuracy)
covariates_continuous_fancy = c("Age","Hrs/day on social media",
"All posts (Sharing)",
"Pre-Survey Misinformation posts (Sharing)",
"Pre-Survey Non-misinformation posts (Sharing)",
"All posts (Accuracy Score)",
"Pre-Survey Misinformation posts (Accuracy Score)",
"Pre-Survey Non-misinformation posts (Accuracy Score)")
covariates_binary_fancy = c("Man",
"High school or less",
"Some college",
"Bachelor\'s degree",
"Graduate degree",
"Married",
"Unemployed",
"Employed",
"Student",
"Mostly urban","Suburban","Mostly rural",
"Christian",
"Attends religious services",
"Uses social media",
"Social media sharing behavior 0-20%",
"Social media sharing behavior 20-40%",
"Social media sharing behavior 40-60%",
"Social media sharing behavior 60-80%",
"Social media sharing behavior 80-100%")
covariates_all_fancy = c("Age",covariates_binary_fancy[!endsWith(covariates_binary_fancy,"%")],"Hrs/day on social media",
covariates_binary_fancy[endsWith(covariates_binary_fancy,"%")],
covariates_continuous_fancy[c(4, 5, 7, 8)])
# generate mapping between covariates_all and covariates_all_fancy
variable_mapping <- covariates_all_fancy
names(variable_mapping) <- covariates_all
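For example (a usage sketch, not in the original script), the named vector translates internal covariate names into their display labels:
# Look up display labels by internal covariate name
variable_mapping[["age"]]         # "Age"
variable_mapping[["gender_Man"]]  # "Man"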
This script focuses on the Rank-Weighted Average Treatment Effect (RATE).
We report RATEs for the following treatment comparisons: Reasoning vs Facts Baseline, Emotions vs Facts Baseline, Combo vs Facts Baseline, Emotions vs Reasoning, and Emotions vs Combo.
Because replicability is the priority, we explicitly fix the default parameters in every causal forest specification:
- sample.fraction = 0.5
- min.node.size = 5
- honesty.fraction = 0.5
- mtry = 26
- honesty.prune.leaves = 1
- alpha = 0.05
- imbalance.penalty = 0
- num.trees = 2000
- tune.num.trees = 200

run_RATE <- function(outcomes, treatment, wide_treatment){
# Alternative tuning parameters (commented out; the fixed defaults above are used instead)
# num.trees <- 3000
# mtry <- 50
# honesty.prune.leaves <- FALSE
# tune.parameters <- c("sample.fraction", "min.node.size", "honesty.fraction")
# tune.num.trees <- 300
set.seed(777)
RATEs <- list()
Parameters <- list()
Pvals <- list()
# Loop over all outcomes Y
for (outcome in outcomes) {
# Print the outcome at the top of each iteration
print(paste("Outcome:", outcome))
# Define Y to be the outcome
Y <- wide_treatment[,outcome]
# Define X to be the covariate matrix
fmla <- formula(paste0("~ 0 + ", paste0(covariates_all, collapse="+")))
X <- model.matrix(fmla, wide_treatment)
# Define W to be the treatment
W <- wide_treatment[,treatment]
# Train a causal forest to estimate a CATE based priority ranking
train <- sample(1:nrow(X), nrow(X) / 2)
cf.priority <- causal_forest(X[train,], Y[train], W[train], seed = 777, num.trees = 2000,
sample.fraction = 0.5,
min.node.size = 5,
honesty.fraction = 0.5,
mtry = 26,
honesty.prune.leaves = 1,
alpha = 0.05,
imbalance.penalty = 0,
tune.num.trees = 200)
tuned.parameters <- cf.priority$tunable.params
Parameters[[outcome]] <- tuned.parameters
# Compute a prioritization based on estimated treatment effects.
priority.cate <- predict(cf.priority, X[-train,])$predictions
# Train an evaluation forest on the held-out data with the same fixed (default) parameters
cf.eval <- causal_forest(X[-train, ], Y[-train], W[-train], seed = 777, num.trees = 2000,
sample.fraction = 0.5,
min.node.size = 5,
honesty.fraction = 0.5,
mtry = 26,
honesty.prune.leaves = 1,
alpha = 0.05,
imbalance.penalty = 0,
tune.num.trees = 200)
# Estimate AUTOC
rate <- rank_average_treatment_effect(cf.eval, priority.cate)
RATEs[[outcome]] <- rate
t.stat = rate$estimate / rate$std.err
# Compute a two-sided p-value Pr(>|t|)
p.val = 2 * pnorm(-abs(t.stat))
Pvals[[outcome]] <- p.val
# Plot the Targeting Operator Characteristic (TOC) curve
plot(rate, las = 1)
# Add a title identifying the outcome
title(paste("TOC Curve for", outcome), line = -2, cex.main = 0.8)
}
return(list(RATEs = RATEs, Parameters = Parameters, Pvals = Pvals))
}
outcomes <- c("misinfo_post", "base_rate_post", "new_disc_post")
rate_RvF <- run_RATE(outcomes, "treatmentReasoning", wide %>% filter(treatment %in% c("Reasoning", "Facts Baseline")))
## [1] "Outcome: misinfo_post"
## NULL
## [1] "Outcome: base_rate_post"
## NULL
## [1] "Outcome: new_disc_post"
## NULL
rate_EvF <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Emotions", "Facts Baseline")))
## [1] "Outcome: misinfo_post"
## NULL
## [1] "Outcome: base_rate_post"
## NULL
## [1] "Outcome: new_disc_post"
## NULL
rate_CvF <- run_RATE(outcomes, "treatmentCombo", wide %>% filter(treatment %in% c("Combo", "Facts Baseline")))
## [1] "Outcome: misinfo_post"
## NULL
## [1] "Outcome: base_rate_post"
## NULL
## [1] "Outcome: new_disc_post"
## NULL
rate_EvR <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Reasoning", "Emotions")))
## [1] "Outcome: misinfo_post"
## NULL
## [1] "Outcome: base_rate_post"
## NULL
## [1] "Outcome: new_disc_post"
## NULL
rate_EvC <- run_RATE(outcomes, "treatmentEmotions", wide %>% filter(treatment %in% c("Emotions", "Combo")))
## [1] "Outcome: misinfo_post"
## NULL
## [1] "Outcome: base_rate_post"
## NULL
## [1] "Outcome: new_disc_post"
## NULL
RATE estimation relies on sample splitting: half of the data is used to fit a causal forest that produces the CATE-based priority ranking, and the other half serves as the evaluation set on which the RATE is computed.
The table below reports the RATE estimate, standard error, and two-sided p-value for each outcome and comparison.
Numbers in parentheses () are standard errors, and numbers in brackets [] are two-sided p-values.
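As an illustration of how the stored objects can be used (a sketch, not part of the original script), a 95% confidence interval for a RATE follows directly from its estimate and standard error:
# Sketch: 95% CI for the Reasoning vs Facts Baseline RATE on misinformation sharing
rate_est <- rate_RvF$RATEs$misinfo_post
round(rate_est$estimate + c(-1.96, 1.96) * rate_est$std.err, 4)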
# organizing table
RATE_output <- matrix(NA, nrow = 15, ncol = 4)
RATE_output[1, ] <- c("Reasoning vs Facts Baseline", rate_RvF$RATEs$misinfo_post$estimate, rate_RvF$RATEs$base_rate_post$estimate, rate_RvF$RATEs$new_disc_post$estimate)
RATE_output[2, ] <- c("", rate_RvF$RATEs$misinfo_post$std.err, rate_RvF$RATEs$base_rate_post$std.err, rate_RvF$RATEs$new_disc_post$std.err)
RATE_output[3, ] <- c("", rate_RvF$Pvals$misinfo_post, rate_RvF$Pvals$base_rate_post, rate_RvF$Pvals$new_disc_post)
RATE_output[4, ] <- c("Emotions vs Facts Baseline", rate_EvF$RATEs$misinfo_post$estimate, rate_EvF$RATEs$base_rate_post$estimate, rate_EvF$RATEs$new_disc_post$estimate)
RATE_output[5, ] <- c(" ", rate_EvF$RATEs$misinfo_post$std.err, rate_EvF$RATEs$base_rate_post$std.err, rate_EvF$RATEs$new_disc_post$std.err)
RATE_output[6, ] <- c(" ", rate_EvF$Pvals$misinfo_post, rate_EvF$Pvals$base_rate_post, rate_EvF$Pvals$new_disc_post)
RATE_output[7, ] <- c("Combo vs Facts Baseline", rate_CvF$RATEs$misinfo_post$estimate, rate_CvF$RATEs$base_rate_post$estimate, rate_CvF$RATEs$new_disc_post$estimate)
RATE_output[8, ] <- c(" ", rate_CvF$RATEs$misinfo_post$std.err, rate_CvF$RATEs$base_rate_post$std.err, rate_CvF$RATEs$new_disc_post$std.err)
RATE_output[9, ] <- c(" ", rate_CvF$Pvals$misinfo_post, rate_CvF$Pvals$base_rate_post, rate_CvF$Pvals$new_disc_post)
RATE_output[10, ] <- c("Emotions vs Reasoning", rate_EvR$RATEs$misinfo_post$estimate, rate_EvR$RATEs$base_rate_post$estimate, rate_EvR$RATEs$new_disc_post$estimate)
RATE_output[11, ] <- c(" ", rate_EvR$RATEs$misinfo_post$std.err, rate_EvR$RATEs$base_rate_post$std.err, rate_EvR$RATEs$new_disc_post$std.err)
RATE_output[12, ] <- c(" ", rate_EvR$Pvals$misinfo_post, rate_EvR$Pvals$base_rate_post, rate_EvR$Pvals$new_disc_post)
RATE_output[13, ] <- c("Emotions vs Combo", rate_EvC$RATEs$misinfo_post$estimate, rate_EvC$RATEs$base_rate_post$estimate, rate_EvC$RATEs$new_disc_post$estimate)
RATE_output[14, ] <- c(" ", rate_EvC$RATEs$misinfo_post$std.err, rate_EvC$RATEs$base_rate_post$std.err, rate_EvC$RATEs$new_disc_post$std.err)
RATE_output[15, ] <- c(" ", rate_EvC$Pvals$misinfo_post, rate_EvC$Pvals$base_rate_post, rate_EvC$Pvals$new_disc_post)
# round to four digits for column 2 to 4
RATE_output[, 2:4] <- round(as.numeric(RATE_output[, 2:4]), 4)
colnames(RATE_output) <- c("Comparison", "Misinformation Sharing", "Non-misinformation Sharing", "Sharing Discernment")
# round to 4 digits row 1, 4, 7, 10, 13
RATE_output[(1:5) * 3 - 2, 2:4] <- sapply(RATE_output[(1:5) * 3 - 2, 2:4], function(x) round(as.numeric(x), 4))
# for row 2, 5, 8, 11, 14, add () to the values
RATE_output[(1:5) * 3 - 1, 2:4] <- sapply(RATE_output[(1:5) * 3 - 1, 2:4], function(x) paste0("(", round(as.numeric(x), 4), ")"))
# for row 3, 6, 9, 12, 15, add [] to the values
RATE_output[(1:5) * 3, 2:4] <- sapply(RATE_output[(1:5) * 3, 2:4], function(x) paste0("[", round(as.numeric(x), 4), "]"))
# saveRDS(RATE_output, "../Cleaned_data/7_RATE_output.RDS")
RATE_output <- as.data.frame(RATE_output)
kable(RATE_output, format = "html", digits = 4) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:4, background = "#f7f7f7") %>%
kable_styling(position = "left", latex_options = c("striped", "hold_position"))
| Comparison | Misinformation Sharing | Non-misinformation Sharing | Sharing Discernment |
|---|---|---|---|
| Reasoning vs Facts Baseline | 0.0478 | 0.0199 | 0.0193 |
|  | (0.0121) | (0.0153) | (0.0133) |
|  | [0.0001] | [0.1931] | [0.1473] |
| Emotions vs Facts Baseline | 0.06 | 0.0279 | 0.0137 |
|  | (0.0133) | (0.013) | (0.014) |
|  | [0] | [0.032] | [0.3298] |
| Combo vs Facts Baseline | 0.0264 | 0.0193 | 0.0059 |
|  | (0.0129) | (0.0147) | (0.0162) |
|  | [0.0403] | [0.1896] | [0.7175] |
| Emotions vs Reasoning | 0.0238 | 0.0116 | -0.01 |
|  | (0.0131) | (0.016) | (0.0148) |
|  | [0.0687] | [0.4713] | [0.4995] |
| Emotions vs Combo | -0.0101 | 0.0063 | 0.0116 |
|  | (0.0132) | (0.0148) | (0.0144) |
|  | [0.4442] | [0.6702] | [0.4202] |
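If a standalone copy of the styled table is needed outside the knitted document, kableExtra::save_kable can write it to disk. A sketch with a hypothetical output path, not part of the original script:
# Hypothetical export; the path is illustrative only
# kable(RATE_output, format = "html", digits = 4) %>%
#   kable_styling(position = "left") %>%
#   save_kable(file = "../Cleaned_data/7_RATE_table.html")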